diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 3aef1432f..f98d6fb6f 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,10 @@ nbactions*.xml nb-configuration.xml -eqtl-mapping-pipeline/nb-configuration.xml \ No newline at end of file +eqtl-mapping-pipeline/nb-configuration.xml +/eQTLInteractionAnalyser/nbproject/private/ +/eQTLInteractionAnalyser/build/ +/eQTLInteractionAnalyser/dist/ +/eQTLInteractionAnalyser2/build/ +/eQTLInteractionAnalyser2/dist/ +/eQTLInteractionAnalyser2/nbproject/private/ \ No newline at end of file diff --git a/BinaryMetaAnalyzer/pom.xml b/BinaryMetaAnalyzer/pom.xml index 09d582d05..d9314c876 100644 --- a/BinaryMetaAnalyzer/pom.xml +++ b/BinaryMetaAnalyzer/pom.xml @@ -7,13 +7,13 @@ 1.0.2-SNAPSHOT BinaryMetaAnalyzer - 1.0.4-SNAPSHOT + 1.0.7-SNAPSHOT jar ${project.groupId} genetica-libraries - 1.0.5 + 1.0.7-SNAPSHOT BinaryMetaAnalyzer diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java index c75dbcec7..61f7602dc 100644 --- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java +++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/bondermj/microbialmetanalysis/BinaryMicrobePcaAnalysis.java @@ -105,9 +105,7 @@ public void run(int bufferSize) throws IOException { loadProbeAnnotation(); for (int permutation = 0; permutation < settings.getNrPermutations() + 1; permutation++) { - finalEQTLs = new QTL[bufferSize]; - locationToStoreResult = 0; - bufferHasOverFlown = false; + clearResultsBuffer(); maxSavedPvalue = -Double.MAX_VALUE; // create dataset objects System.out.println("Running permutation " + permutation); @@ -115,7 +113,7 @@ public void run(int bufferSize) throws IOException { System.out.println("Loading 
datasets"); for (int d = 0; d < datasets.length; d++) { - datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation); + datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetnames().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation); } System.out.println("Loaded " + datasets.length + " datasets"); @@ -224,7 +222,7 @@ public void run(int bufferSize) throws IOException { for (int probe = 0; probe < traitList.length; probe++) { double metaAnalysisZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes); double tScore = ZScores.zScoreToCorrelation(metaAnalysisZ, totalSampleSize); - summedRsquare += tScore*tScore; + summedRsquare += tScore * tScore; } double newMetaZ = Correlation.convertCorrelationToZScore(totalSampleSize, Math.sqrt(summedRsquare)); double newMetaAnalysisP = Descriptives.convertZscoreToPvalue(newMetaZ); @@ -244,10 +242,10 @@ public void run(int bufferSize) throws IOException { double metaAnalysisZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes); for (int i = 0; i < finalZScores[probe].length; i++) { double tScore = ZScores.zScoreToCorrelation(finalZScores[probe][i], sampleSizes[i]); - summedPerDataSet[i] += tScore*tScore; + summedPerDataSet[i] += tScore * tScore; } double tScore = ZScores.zScoreToCorrelation(metaAnalysisZ, totalSampleSize); - summedRsquare += tScore*tScore; + summedRsquare += tScore * tScore; } for (int i = 0; i < summedPerDataSet.length; i++) { @@ -261,7 +259,7 @@ public void run(int bufferSize) throws IOException { MetaQTL4MetaTrait t = new MetaQTL4MetaTrait(21, "Microbe_Components", "-", -1, -1, "", traitList[0].getPlatformIds()); QTL q = new QTL(newMetaAnalysisP, t, snp, BaseAnnot.toByte(alleleAssessed), newMetaZ, BaseAnnot.toByteArray(alleles), summedPerDataSet, sampleSizes); // 
sort buffer if needed. addEQTL(q); - } else { + } else { System.out.println("Error in procedure."); } } @@ -546,4 +544,11 @@ private void writeBuffer(String outdir, int permutation) throws IOException { System.out.println( "Done."); } + + private void clearResultsBuffer() { + Arrays.fill(finalEQTLs, null); + bufferHasOverFlown = false; + locationToStoreResult = 0; + maxSavedPvalue = -Double.MAX_VALUE; + } } diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java index 03c109bcf..2c9167723 100644 --- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java +++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysis.java @@ -46,7 +46,7 @@ public static void main(String[] args) { settingsFile = args[0]; } else { - System.out.println("Usage: settings.xml replacetext replacetextwith"); + System.out.println("Usage of the binary meta-analysis: settings.xml replacetext replacetextwith"); System.exit(-1); } @@ -54,7 +54,7 @@ public static void main(String[] args) { System.exit(0); } - + private MetaQTL4TraitAnnotation probeAnnotation; private BinaryMetaAnalysisDataset[] datasets = new BinaryMetaAnalysisDataset[0]; private int[][] snpIndex; @@ -105,24 +105,36 @@ public void run() throws IOException { System.out.println("Loading probe annotation from: " + settings.getProbetranslationfile()); loadProbeAnnotation(); + if (traitList.length == 0) { + System.err.println("Error: no annotation loaded."); + System.exit(-1); + } + for (int permutation = 0; permutation < settings.getNrPermutations() + 1; permutation++) { + clearResultsBuffer(); + // create dataset objects System.out.println("Running permutation " + permutation); datasets = new BinaryMetaAnalysisDataset[settings.getDatasetlocations().size()]; System.out.println("Loading datasets"); for (int d = 0; d < 
datasets.length; d++) { - datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), settings.getDatasetPrefix().get(d), permutation, settings.getDatasetannotations().get(d), probeAnnotation); + datasets[d] = new BinaryMetaAnalysisDataset(settings.getDatasetlocations().get(d), + settings.getDatasetnames().get(d), + settings.getDatasetPrefix().get(d), + permutation, + settings.getDatasetannotations().get(d), + probeAnnotation); } System.out.println("Loaded " + datasets.length + " datasets"); // create meta-analysis SNP index. have to recreate this every permutation, // since the order of SNPs is generated at random. System.out.println("Creating SNP index"); - createSNPIndex(); + createSNPIndex(outdir); System.out.println("Total of " + snpIndex.length + " SNPs"); System.out.println("Creating probe index"); - createProbeIndex(); + createProbeIndex(outdir); System.out.println("Total of " + probeIndex.length + " probes"); if (snpChr == null) { @@ -245,9 +257,10 @@ public void run() throws IOException { double metaZ = ZScores.getWeightedZ(finalZScores[probe], sampleSizes); double p = Descriptives.convertZscoreToPvalue(metaZ); - if (!Double.isNaN(p)) { + if (!Double.isNaN(p) && !Double.isNaN(metaZ)) { // create output object QTL q = new QTL(p, t, snp, BaseAnnot.toByte(alleleAssessed), metaZ, BaseAnnot.toByteArray(alleles), finalZScores[probe], sampleSizes); // sort buffer if needed. 
+// System.out.println(q.getSNPId()+"\t"+q.getMetaTrait().getMetaTraitName()+"\t"+q.toString()); addEQTL(q); } else { // if (!printed) { @@ -307,8 +320,9 @@ public void run() throws IOException { double metaAnalysisP = Descriptives.convertZscoreToPvalue(metaAnalysisZ); // create output object - if (!Double.isNaN(metaAnalysisP)) { + if (!Double.isNaN(metaAnalysisP) && !Double.isNaN(metaAnalysisZ)) { QTL q = new QTL(metaAnalysisP, t, snp, BaseAnnot.toByte(alleleAssessed), metaAnalysisZ, BaseAnnot.toByteArray(alleles), finalZScores[probe], sampleSizes); // sort buffer if needed. +// System.out.println(q.getSNPId()+"\t"+q.getMetaTrait().getMetaTraitName()+"\t"+q.toString()); addEQTL(q); } } @@ -332,7 +346,7 @@ public void run() throws IOException { */ } - private void createSNPIndex() throws IOException { + private void createSNPIndex(String outdir) throws IOException { HashSet confineToTheseSNPs = null; if (settings.getSNPSelection() != null) { @@ -384,6 +398,21 @@ private void createSNPIndex() throws IOException { } } } + + TextFile tf = new TextFile(outdir + "snpindex.txt", TextFile.W); + String header = "metaID"; + for (int d = 0; d < datasets.length; d++) { + header += "\t" + datasets[d].getName() + "-sid"; + } + tf.writeln(header); + for (int s = 0; s < snpList.length; s++) { + String ln = snpList[s]; + for (int d = 0; d < datasets.length; d++) { + ln += "\t" + snpIndex[s][d]; + } + tf.writeln(ln); + } + tf.close(); } private void loadProbeAnnotation() throws IOException { @@ -432,31 +461,73 @@ private void loadSNPAnnotation() throws IOException { } // index the probes - private void createProbeIndex() throws IOException { - + private void createProbeIndex(String outdir) throws IOException { + HashSet confineToTheseProbes = null; - if (settings.getProbeselection()!= null) { + if (settings.getProbeselection() != null) { System.out.println("Selecting Probes from file: " + settings.getProbeselection()); confineToTheseProbes = new HashSet(); TextFile tf = new 
TextFile(settings.getProbeselection(), TextFile.R); confineToTheseProbes.addAll(tf.readAsArrayList()); tf.close(); - System.out.println(confineToTheseProbes.size() + " Probes loaded."); } - + + System.out.println(""); probeIndex = new Integer[traitList.length][datasets.length]; + for (int d = 0; d < datasets.length; d++) { String[] probes = datasets[d].getProbeList(); - int platformId = probeAnnotation.getPlatformId(settings.getDatasetannotations().get(d)); + int platformId = probeAnnotation.getPlatformId(datasets[d].getPlatform()); + + HashMap traitHashForPlatform = probeAnnotation.getTraitHashForPlatform(platformId); + System.out.println(probeAnnotation.getTraitHashPerPlatform().size()); + + System.out.println(datasets[d].getName() + "\t" + platformId + "\t" + datasets[d].getPlatform() + "\t" + traitHashForPlatform.size()); for (int p = 0; p < probes.length; p++) { + + MetaQTL4MetaTrait t = traitHashForPlatform.get(probes[p]); + int index = traitMap.get(t); + + if (probes[p].equals("60437")) { + if (t != null) { + System.out.println(t.getMetaTraitId()); + } else { + System.out.println("not found"); + } + } + if (confineToTheseProbes == null || confineToTheseProbes.contains(probes[p])) { - MetaQTL4MetaTrait t = probeAnnotation.getTraitForPlatformId(platformId, probes[p]); - int index = traitMap.get(t); probeIndex[index][d] = p; } } } + + System.out.println(""); + + TextFile out = new TextFile(outdir + "probeindex.txt", TextFile.W); + + String header = "metaID"; + for (int d = 0; d < datasets.length; d++) { + header += "\t" + datasets[d].getName() + "-pid\t" + datasets[d].getName() + "-probename"; + } + out.writeln(header); + for (int p = 0; p < probeIndex.length; p++) { + + String lnout = "" + traitList[p].getMetaTraitId(); + for (int d = 0; d < datasets.length; d++) { + Integer pid = probeIndex[p][d]; + String probeName = null; + if (pid != null) { + probeName = datasets[d].getProbeList()[pid]; + } + lnout += "\t" + pid + "\t" + probeName; + } + + 
out.writeln(lnout); + } + + out.close(); } private void addEQTL(QTL q) { @@ -529,10 +600,10 @@ private void writeBuffer(String outdir, int permutation) throws IOException { + "Meta-Beta (SE)\t" + "Beta (SE)\t" + "FoldChange"; - + output.writeln(header); // PValue SNPName SNPChr SNPChrPos ProbeName ProbeChr ProbeCenterChrPos CisTrans SNPType AlleleAssessed OverallZScore DatasetsWhereSNPProbePairIsAvailableAndPassesQC DatasetsZScores DatasetsNrSamples IncludedDatasetsMeanProbeExpression IncludedDatasetsProbeExpressionVariance HGNCName IncludedDatasetsCorrelationCoefficient Meta-Beta (SE) Beta (SE) FoldChange FDR - + DecimalFormat format = new DecimalFormat("###.#######", new DecimalFormatSymbols(Locale.US)); DecimalFormat smallFormat = new DecimalFormat("0.#####E0", new DecimalFormatSymbols(Locale.US)); for (int i = 0; i < settings.getFinalEQTLBufferMaxLength(); i++) { @@ -580,13 +651,20 @@ private void writeBuffer(String outdir, int permutation) throws IOException { float[] datasetZScores = q.getDatasetZScores(); String[] dsBuilder = new String[datasets.length]; String[] dsNBuilder = new String[datasets.length]; + String[] dsZBuilder = new String[datasets.length]; + for (int d = 0; d < datasetZScores.length; d++) { + if (!Float.isNaN(datasetZScores[d])) { + String str = format.format(datasetZScores[d]); + dsBuilder[d] = settings.getDatasetnames().get(d); dsNBuilder[d] = "" + q.getDatasetSampleSizes()[d]; + dsZBuilder[d] = str; } else { dsBuilder[d] = "-"; dsNBuilder[d] = "-"; + dsZBuilder[d] = "-"; } } @@ -594,7 +672,7 @@ private void writeBuffer(String outdir, int permutation) throws IOException { sb.append(Strings.concat(dsBuilder, Strings.semicolon)); sb.append("\t"); - sb.append(Strings.concat(datasetZScores, format, Strings.semicolon)); + sb.append(Strings.concat(dsZBuilder, Strings.semicolon)); sb.append("\t"); sb.append(Strings.concat(dsNBuilder, Strings.semicolon)); @@ -612,4 +690,11 @@ private void writeBuffer(String outdir, int permutation) throws 
IOException { System.out.println( "Done."); } + + private void clearResultsBuffer() { + Arrays.fill(finalEQTLs, null); + bufferHasOverFlown = false; + locationToStoreResult = 0; + maxSavedPvalue = -Double.MAX_VALUE; + } } diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java index 7b9657a3b..4af125e99 100644 --- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java +++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/BinaryMetaAnalysisDataset.java @@ -38,11 +38,16 @@ public class BinaryMetaAnalysisDataset { private final int platformId; private RandomAccessFile raf; - public BinaryMetaAnalysisDataset(String dir, String prefix, int permutation, String platform, MetaQTL4TraitAnnotation probeAnnotation) throws IOException { + private String name = null; + private String platform = null; + + public BinaryMetaAnalysisDataset(String dir, String name, String prefix, int permutation, String platform, MetaQTL4TraitAnnotation probeAnnotation) throws IOException { dir = Gpio.formatAsDirectory(dir); String matrix = dir; String probeFile = dir; String snpFile = dir; + this.platform = platform; + this.name = name; this.probeAnnotation = probeAnnotation; this.platformId = probeAnnotation.getPlatformId(platform); String pref = "Dataset"; @@ -265,4 +270,12 @@ public void close() throws IOException { raf.close(); } + public String getName() { + return name; + } + + public String getPlatform() { + return platform; + } + } diff --git a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java index 8737b2175..e00e376da 100644 --- a/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java 
+++ b/BinaryMetaAnalyzer/src/main/java/nl/umcg/westrah/binarymetaanalyzer/MetaQTL4TraitAnnotation.java @@ -67,8 +67,9 @@ public MetaQTL4TraitAnnotation(File probeAnnotationFile, Set platformsTo } } - int probeCounter = 0; + + // parse lines for (String[] elems : tf.readLineElemsIterable(TextFile.tab)) { String metaTraitName = elems[0]; @@ -89,12 +90,14 @@ public MetaQTL4TraitAnnotation(File probeAnnotationFile, Set platformsTo } String hugo = elems[4]; + String[] platformIds = new String[nrPlatforms]; // int metaTraitId, String metaTraitName, String chr, int chrStart, int chrEnd, String annotation, String[] platformIds + MetaQTL4MetaTrait metaTraitObj = new MetaQTL4MetaTrait(probeCounter, metaTraitName, chr, chrstartpos, chrendpos, hugo, platformIds); + platformNr = 0; for (int i = 5; i < elems.length; i++) { - platformNr = 0; if (colsToInclude[i]) { platformIds[platformNr] = elems[i]; HashMap probeToId = traitHashPerPlatform.get(platformNr); @@ -102,6 +105,7 @@ public MetaQTL4TraitAnnotation(File probeAnnotationFile, Set platformsTo platformNr++; } } + probeCounter++; metatraits.add(metaTraitObj); metaTraitNameToObj.put(metaTraitName, metaTraitObj); @@ -118,6 +122,10 @@ public MetaQTL4MetaTrait getTraitForPlatformId(Integer platformId, String platfo return traitHashPerPlatform.get(platformId).get(platformTrait); } + public HashMap getTraitHashForPlatform(Integer platformId) { + return traitHashPerPlatform.get(platformId); + } + public String[] getPlatforms() { return platforms; } diff --git a/Genotype-Harmonizer/nb-configuration.xml b/Genotype-Harmonizer/nb-configuration.xml index 5f8c56a87..4c7dcce1c 100644 --- a/Genotype-Harmonizer/nb-configuration.xml +++ b/Genotype-Harmonizer/nb-configuration.xml @@ -12,4 +12,13 @@ Without this configuration present, some functionality in the IDE may be limited + + + JDK_1.7 + diff --git a/Genotype-Harmonizer/pom.xml b/Genotype-Harmonizer/pom.xml index d5142e475..915a595c5 100644 --- a/Genotype-Harmonizer/pom.xml +++ 
b/Genotype-Harmonizer/pom.xml @@ -7,14 +7,14 @@ 4.0.0 Genotype-Harmonizer - 1.4.12-SNAPSHOT + 1.4.16-SNAPSHOT Genotype Harmonizer jar nl.systemsgenetics Genotype-IO - 1.0.1 + 1.0.2-SNAPSHOT commons-cli @@ -108,6 +108,15 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.7 + 1.7 + + diff --git a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java index 27153622f..1a68bf6cb 100644 --- a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java +++ b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/Aligner.java @@ -1,7 +1,6 @@ package nl.umcg.deelenp.genotypeharmonizer; import static JSci.maths.ArrayMath.covariance; -import static JSci.maths.ArrayMath.variance; import com.google.common.collect.Lists; import java.io.BufferedWriter; import java.io.File; @@ -69,21 +68,21 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA //In this loop we filter the variants present in the reference and swap the AG, AC, TC, TG SNPs. 
studyVariants: for (ModifiableGeneticVariant studyVariant : aligendStudyData.getModifiableGeneticVariants()) { - + ++iterationCounter; if (iterationCounter % 10000 == 0) { //LOGGER.info("Iteration 1 - " + GenotypeHarmonizer.DEFAULT_NUMBER_FORMATTER.format(iterationCounter) + " variants processed"); System.out.println("Iteration 1 - " + GenotypeHarmonizer.DEFAULT_NUMBER_FORMATTER.format(iterationCounter) + " variants processed"); } - + if (!studyVariant.isMapped()) { snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "No mapping"); studyVariant.exclude(); continue studyVariants; } - - if(studyVariant.getStartPos() == 0){ + + if (studyVariant.getStartPos() == 0) { snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "No mapping"); studyVariant.exclude(); continue studyVariants; @@ -176,19 +175,19 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA //If we get here we have found a variant is our reference data on the same position with comparable alleles. 
- //We have to exclude maf of zero otherwise we cannot do LD calculation - if (!(studyVariant.getMinorAlleleFrequency() > 0)) { - snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in study data"); - studyVariant.exclude(); - continue studyVariants; - } - - //We have to exclude maf of zero otherwise we can not do LD calculation - if (!(refVariant.getMinorAlleleFrequency() > 0)) { - snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in reference data"); - studyVariant.exclude(); - continue studyVariants; - } +// //We have to exclude maf of zero otherwise we cannot do LD calculation +// if (!(studyVariant.getMinorAlleleFrequency() > 0)) { +// snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in study data"); +// studyVariant.exclude(); +// continue studyVariants; +// } +// +// //We have to exclude maf of zero otherwise we can not do LD calculation +// if (!(refVariant.getMinorAlleleFrequency() > 0)) { +// snpLogWriter.addToLog(studyVariant, SnpLogWriter.Actions.EXCLUDED, "MAF of 0 in reference data"); +// studyVariant.exclude(); +// continue studyVariants; +// } @@ -238,8 +237,8 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA if (updateId) { snpUpdateWriter.close(); } - - if(iterationCounter == 0){ + + if (iterationCounter == 0) { throw new GenotypeAlignmentException("No variants where found in the input genotype data. 
Please check your variant filter options"); } @@ -283,7 +282,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA if (!studyVariant.isAtOrGcSnp()) { //Correlate the haps with both these snps between study and ref - correlationResults hapCor = correlateHaplotypes(minLdToIncludeAlign, + CorrelationResults hapCor = correlateHaplotypes(minLdToIncludeAlign, flankSnpsToConsider, studyVariantList, refVariantList, variantIndex, studyVariant, refVariant); @@ -345,7 +344,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA ++GcAtSnpsEncountered; //Correlate the haps with both these snps between study and ref - correlationResults hapCor = correlateHaplotypes(minLdToIncludeAlign, + CorrelationResults hapCor = correlateHaplotypes(minLdToIncludeAlign, flankSnpsToConsider, studyVariantList, refVariantList, variantIndex, studyVariant, refVariant); @@ -396,7 +395,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA //Ld pattern should be okay now. 
but we are going to do the extra check //Correlate the haps with both these snps between study and ref - correlationResults hapCorSwapped = correlateHaplotypes(minLdToIncludeAlign, + CorrelationResults hapCorSwapped = correlateHaplotypes(minLdToIncludeAlign, flankSnpsToConsider, studyVariantList, refVariantList, variantIndex, studyVariant, refVariant); @@ -445,7 +444,7 @@ public ModifiableGenotypeData alignToRef(RandomAccessGenotypeData study, RandomA } - private correlationResults correlateHaplotypes(double minLdToIncludeAlignBase, + private CorrelationResults correlateHaplotypes(double minLdToIncludeAlignBase, int flankSnpsToConsider, ArrayList studyVariantList, ArrayList refVariantList, int variantIndex, @@ -488,36 +487,12 @@ private correlationResults correlateHaplotypes(double minLdToIncludeAlignBase, ldStudy = LdCalculator.calculateLd(snpStudyVariant, otherSnpStudyVariant); ldRef = LdCalculator.calculateLd(refVariant, otherRefVariant); } catch (LdCalculatorException e) { - LOGGER.warn("Error in LD calculation, skipping this comparison when comparing haplotype structure. Following error occurred: " + e.getMessage()); + LOGGER.debug("Error in LD calculation, skipping this comparison when comparing haplotype structure. 
Following error occurred: " + e.getMessage()); continue; } -// if(snpStudyVariant.getPrimaryVariantId().equals("rs1001945")){ -// LOGGER.debug(" * Other variant: " + otherSnpStudyVariant.getPrimaryVariantId() + -// "\nstudy alleles: " + otherSnpStudyVariant.getVariantAlleles() + " ref alleles: " + otherRefVariant.getVariantAlleles() + "\n" -// + "maf study: " + otherSnpStudyVariant.getMinorAlleleFrequency() + "(" + otherSnpStudyVariant.getMinorAllele() + ") maf ref: " + otherRefVariant.getMinorAlleleFrequency() + "(" + otherRefVariant.getMinorAllele() + ")\n" + -// "LD study, R2: " + ldStudy.getR2() + " D': " + ldStudy.getDPrime() + "\n" + -// "LD ref, R2: " + ldRef.getR2() + " D': " + ldRef.getDPrime() + "\n"); -// -// -// StringBuilder s = new StringBuilder(); -// for(byte b : snpStudyVariant.getSampleCalledDosages()){ -// s.append(b); -// } -// LOGGER.debug(s); -// -// s = new StringBuilder(); -// for(byte b : otherSnpStudyVariant.getSampleCalledDosages()){ -// s.append(b); -// } -// LOGGER.debug(s); -// -// -// -// -// } //only use SNPs with min R2 in both study as ref - if (ldStudy.getR2() >= minLdToIncludeAlignBase && ldRef.getR2() >= minLdToIncludeAlignBase) { + if ( !Double.isNaN(ldStudy.getR2()) && !Double.isNaN(ldRef.getR2()) && ldStudy.getR2() >= minLdToIncludeAlignBase && ldRef.getR2() >= minLdToIncludeAlignBase) { //Put in tree map to sort haplotypes. 
This can differ in the case of different reference allele TreeMap studyHapFreq = new TreeMap(ldStudy.getHaplotypesFreq()); @@ -542,14 +517,14 @@ private correlationResults correlateHaplotypes(double minLdToIncludeAlignBase, ++posCor; } - } + } } } - return new correlationResults(posCor, negCor); + return new CorrelationResults(posCor, negCor); } private double[] createDoubleArrayFromCollection( @@ -567,12 +542,12 @@ private double[] createDoubleArrayFromCollection( return array; } - private static class correlationResults { + private static class CorrelationResults { private final int posCor; private final int negCor; - public correlationResults(int posCor, int negCor) { + public CorrelationResults(int posCor, int negCor) { super(); this.posCor = posCor; this.negCor = negCor; diff --git a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java index 29d70adf1..8ce8b3cdc 100644 --- a/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java +++ b/Genotype-Harmonizer/src/main/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerParamaters.java @@ -282,7 +282,7 @@ public GenotypeHarmonizerParamaters(String... args) throws ParseException { try { if (commandLine.hasOption('I')) { - inputType = RandomAccessGenotypeDataReaderFormats.valueOf(commandLine.getOptionValue('I').toUpperCase()); + inputType = RandomAccessGenotypeDataReaderFormats.valueOfSmart(commandLine.getOptionValue('I').toUpperCase()); } else { if (inputBasePaths[0].endsWith(".vcf")) { throw new ParseException("Only vcf.gz is supported. 
Please see manual on how to do create a vcf.gz file."); diff --git a/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java b/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java index 4fc4fa4c7..a4d338bac 100644 --- a/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java +++ b/Genotype-Harmonizer/src/test/java/nl/umcg/deelenp/genotypeharmonizer/GenotypeHarmonizerTest.java @@ -115,7 +115,7 @@ public void testMain() throws Exception { } - assertEquals(variantCounter, 3745); + assertEquals(variantCounter, 3747); //Check if ID is updated based on 1000G assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400"); @@ -183,7 +183,7 @@ public void testMain2() throws Exception { } - assertEquals(variantCounter, 4086); + assertEquals(variantCounter, 4088); //Check if number of samples is correct assertEquals(aligenedHapmap3Data.getSamples().size(), 165); @@ -252,7 +252,7 @@ public void testMain3() throws Exception { } - assertEquals(variantCounter, 4086); + assertEquals(variantCounter, 4088); //Check if ID is updated based on 1000G assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400"); @@ -358,7 +358,7 @@ public void testMain5() throws Exception { } - assertEquals(variantCounter, 3778); + assertEquals(variantCounter, 3780); //Check if ID is updated based on 1000G assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400"); @@ -423,7 +423,7 @@ public void testMain6() throws Exception { } - assertEquals(variantCounter, 3745); + assertEquals(variantCounter, 3747); //Check if ID is updated based on 1000G assertEquals(aligenedHapmap3Data.getSnpVariantByPos("20", 809930).getPrimaryVariantId(), "rs78472400"); @@ -486,7 +486,7 @@ public void testMain7() throws Exception { } - assertEquals(variantCounter, 4078); + 
assertEquals(variantCounter, 4087); //Check if number of samples is correct assertEquals(aligenedHapmap3Data.getSamples().size(), 155); diff --git a/Genotype-IO/pom.xml b/Genotype-IO/pom.xml index da4b14682..d071df3b4 100644 --- a/Genotype-IO/pom.xml +++ b/Genotype-IO/pom.xml @@ -82,6 +82,8 @@ 2.3.2 UTF-8 + 1.7 + 1.7 diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java b/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java index 8770c6bc7..6335d7538 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/AbstractRandomAccessGenotypeData.java @@ -6,15 +6,14 @@ import org.molgenis.genotype.variant.GeneticVariant; import org.molgenis.genotype.variantFilter.VariantFilter; -public abstract class AbstractRandomAccessGenotypeData extends AbstractGenotypeData implements RandomAccessGenotypeData -{ +public abstract class AbstractRandomAccessGenotypeData extends AbstractGenotypeData implements RandomAccessGenotypeData { + + private HashMap fullVariantMap = null; + @Override - public Sequence getSequenceByName(String name) - { - for (Sequence sequence : getSequences()) - { - if (sequence.getName().equals(name)) - { + public Sequence getSequenceByName(String name) { + for (Sequence sequence : getSequences()) { + if (sequence.getName().equals(name)) { return sequence; } } @@ -23,14 +22,11 @@ public Sequence getSequenceByName(String name) } @Override - public GeneticVariant getSnpVariantByPos(String seqName, int startPos) - { + public GeneticVariant getSnpVariantByPos(String seqName, int startPos) { Iterable variants = getVariantsByPos(seqName, startPos); - for (GeneticVariant variant : variants) - { - if (variant.isSnp()) - { + for (GeneticVariant variant : variants) { + if (variant.isSnp()) { // only one SNP possible per position. 
Returning this SNP only return variant; } @@ -42,59 +38,63 @@ public GeneticVariant getSnpVariantByPos(String seqName, int startPos) @Override public HashMap getVariantIdMap() { - return getVariantIdMap(null); + + if (fullVariantMap == null) { + fullVariantMap = getVariantIdMap(null); + } + return fullVariantMap; + } + @Override + public void clearVariantIdMap(){ + fullVariantMap = null; + } + @Override public HashMap getVariantIdMap(VariantFilter filter) { - + HashMap variantIdMap = new HashMap(); - - for(GeneticVariant variant : this){ - if( variant.getVariantId().getPrimairyId() != null && !variant.getPrimaryVariantId().equals("") && (filter == null || filter.doesVariantPassFilter(variant))){ + + for (GeneticVariant variant : this) { + if (variant.getVariantId().getPrimairyId() != null && !variant.getPrimaryVariantId().equals("") && (filter == null || filter.doesVariantPassFilter(variant))) { variantIdMap.put(variant.getPrimaryVariantId(), variant); } } - + return variantIdMap; - + } @Override - public Iterator iterator() - { + public Iterator iterator() { return new GeneticVariantsIterator(this); } - private static class GeneticVariantsIterator implements Iterator - { + private static class GeneticVariantsIterator implements Iterator { + private Iterator seqNames; private Iterator seqGeneticVariants; private RandomAccessGenotypeData randomAccessGenotypeData; - public GeneticVariantsIterator(RandomAccessGenotypeData randomAccessGenotypeData) - { + public GeneticVariantsIterator(RandomAccessGenotypeData randomAccessGenotypeData) { seqNames = randomAccessGenotypeData.getSeqNames().iterator(); seqGeneticVariants = randomAccessGenotypeData.getSequenceGeneticVariants(seqNames.next()).iterator(); this.randomAccessGenotypeData = randomAccessGenotypeData; } @Override - public boolean hasNext() - { + public boolean hasNext() { return seqGeneticVariants.hasNext() || seqNames.hasNext(); } @Override - public GeneticVariant next() - { - if (seqGeneticVariants.hasNext()) - { 
+ public GeneticVariant next() { + if (seqGeneticVariants.hasNext()) { return seqGeneticVariants.next(); } - if (seqNames.hasNext()) - { + if (seqNames.hasNext()) { seqGeneticVariants = randomAccessGenotypeData.getSequenceGeneticVariants(seqNames.next()).iterator(); return seqGeneticVariants.next(); } @@ -103,12 +103,8 @@ public GeneticVariant next() } @Override - public void remove() - { + public void remove() { throw new UnsupportedOperationException(); } - - - } } diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java index 68309c1fe..2cdccd06f 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeData.java @@ -81,6 +81,11 @@ public interface RandomAccessGenotypeData extends GenotypeData { */ HashMap getVariantIdMap(VariantFilter filter); + /** + * Variant ID map without filter is saved as cache, use this function to clear this cache. + */ + void clearVariantIdMap(); + /** * Get a HashMap with the variants that have a primairy ID. 
* diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java index 37c1ef10d..bafbfd8bb 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/RandomAccessGenotypeDataReaderFormats.java @@ -310,6 +310,10 @@ public static RandomAccessGenotypeDataReaderFormats valueOfSmart(String value){ return PLINK_BED; } else if (value.equals("B_PLINK")){ return PLINK_BED; + } else if (value.equals("PLINKB")){ + return PLINK_BED; + } else if (value.equals("PLINK_B")){ + return PLINK_BED; } return RandomAccessGenotypeDataReaderFormats.valueOf(value); diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java index a79b9836d..963a9dd2f 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/BedBimFamGenotypeWriter.java @@ -143,7 +143,7 @@ private void writeBimBedFile(File bimFile, File bedFile) throws IOException { continue; } - bimFileWriter.append(variant.getSequenceName()); + bimFileWriter.append(FormatPlinkChr.formatChr(variant.getSequenceName())); bimFileWriter.append(SEPARATOR); bimFileWriter.append(variant.getPrimaryVariantId() == null ? 
variant.getSequenceName() + ":" + variant.getStartPos() : variant.getPrimaryVariantId()); bimFileWriter.append(SEPARATOR); diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/FormatPlinkChr.java b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/FormatPlinkChr.java new file mode 100644 index 000000000..92bcbbe70 --- /dev/null +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/FormatPlinkChr.java @@ -0,0 +1,31 @@ +package org.molgenis.genotype.plink; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * + * @author Patrick Deelen + */ +public class FormatPlinkChr { + + private static final Pattern CHR_PATTERN = Pattern.compile("^chr(.*)$", Pattern.CASE_INSENSITIVE); + + public static String formatChr(String chrName){ + + Matcher chrMatcher = CHR_PATTERN.matcher(chrName); + if (chrMatcher.find()) { + chrName = chrMatcher.group(1); + } + + switch(chrName){ + case "X": return "23"; + case "Y": return "24"; + case "XY": return "25"; + case "MT": return "26"; + default: return chrName; + } + + } + +} diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java index d6c091cb7..d5ec45c7b 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/plink/PedMapGenotypeWriter.java @@ -76,7 +76,7 @@ private void writeMapFile(File mapFile) throws IOException { continue; } - mapFileWriter.append(variant.getSequenceName()); + mapFileWriter.append(FormatPlinkChr.formatChr(variant.getSequenceName())); mapFileWriter.append(SEPARATOR); mapFileWriter.append(variant.getPrimaryVariantId() == null ? 
variant.getSequenceName() + ":" + variant.getStartPos() : variant.getPrimaryVariantId()); mapFileWriter.append(SEPARATOR); diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java index bd99b0e99..ce13c2c99 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/table/TableGenotypeWriter.java @@ -42,8 +42,13 @@ public void write(String path) { for (GeneticVariant variant : genotypeData) { - dosageWriter.append(variant.getPrimaryVariantId()); - genotypeWriter.append(variant.getPrimaryVariantId()); + String variantId = variant.getPrimaryVariantId(); + if(variantId == null){ + variantId = variant.getSequenceName() + ":" + variant.getStartPos(); + } + + dosageWriter.append(variantId); + genotypeWriter.append(variantId); for (float dosage : variant.getSampleDosages()) { dosageWriter.append('\t'); diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java index 4ce5477c7..00178a2e9 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeData.java @@ -120,7 +120,7 @@ public TriTyperGenotypeData(File location, int cacheSize, VariantFilter variantF } public TriTyperGenotypeData(File location, int cacheSize, VariantFilter variantFilter, SampleFilter sampleFilter) throws IOException { - this(new File(location, "GenotypeMatrix.dat"), new File(location, "ImputedDosageMatrix.dat").exists() ? new File(location, "ImputedDosageMatrix.dat") : null, new File(location, "SNPs.txt.gz").exists() ? new File(location, "SNPs.txt.gz") : new File(location, "SNPs.txt"), new File(location, "SNPMappings.txt.gz").exists() ? 
new File(location, "SNPMappings.txt.gz") : new File(location, "SNPMappings.txt"), new File(location, "Individuals.txt.gz").exists() ? new File(location, "Individuals.txt.gz") : new File(location, "Individuals.txt"), new File(location, "PhenotypeInformation.txt.gz").exists() ? new File(location, "PhenotypeInformation.txt.gz") : new File(location, "PhenotypeInformation.txt"), cacheSize, variantFilter, sampleFilter, new File(location, "allelRecodingInformation.txt").exists() ? new File(location, "allelRecodingInformation.txt") : null); + this(new File(location, "GenotypeMatrix.dat"), new File(location, "ImputedDosageMatrix.dat").exists() ? new File(location, "ImputedDosageMatrix.dat") : null, new File(location, "SNPs.txt.gz").exists() ? new File(location, "SNPs.txt.gz") : new File(location, "SNPs.txt"), new File(location, "SNPMappings.txt.gz").exists() ? new File(location, "SNPMappings.txt.gz") : new File(location, "SNPMappings.txt"), new File(location, "Individuals.txt.gz").exists() ? new File(location, "Individuals.txt.gz") : new File(location, "Individuals.txt"), new File(location, "PhenotypeInformation.txt.gz").exists() ? new File(location, "PhenotypeInformation.txt.gz") : new File(location, "PhenotypeInformation.txt"), cacheSize, variantFilter, sampleFilter, new File(location, "AlleleRecodingInformation.txt").exists() ? new File(location, "AlleleRecodingInformation.txt") : null); } public TriTyperGenotypeData(File genotypeDataFile, File imputedDosageDataFile, File snpFile, File snpMapFile, File individualFile, File phenotypeAnnotationFile, int cacheSize, VariantFilter variantFilter, SampleFilter sampleFilter, File allelRecoding) throws IOException { @@ -345,6 +345,9 @@ private void loadSNPAnnotation(GeneticVariantRange.GeneticVariantRangeCreate snp String line; while ((line = snpFileReader.readLine()) != null) { + if(allSNPHash.contains(line)){ + throw new GenotypeDataException("SNP found twice: " + line + ". 
All SNP ID's must be unique"); + } if (variantFilter == null || variantFilter.doesIdPassFilter(line)) { allSNPHash.put(line, unfilteredSnpCount); } @@ -459,11 +462,17 @@ public List getSampleVariants(GeneticVariant variant) { try { genotypeHandle.seek(indexLong); if (genotypeHandle.read(buffer) != buffer.length) { + + LOG.fatal("ERROR loading trityper SNP: " + variant.getPrimaryVariantId() + " at: " + variant.getSequenceName() + ":" + variant.getStartPos() + " variant index: " + index); + throw new GenotypeDataException("Could not read bytes from: " + indexLong + " in genotype file " + genotypeDataFile.getAbsolutePath() + " (size: " + genotypeDataFile.length() + ")"); } } catch (IOException e) { - throw new GenotypeDataException("Could not read bytes from: " + indexLong + " in genotype file " + genotypeDataFile.getAbsolutePath() + " (size: " + genotypeDataFile.length() + ")"); + + LOG.fatal("ERROR loading trityper SNP: " + variant.getPrimaryVariantId() + " at: " + variant.getSequenceName() + ":" + variant.getStartPos() + " variant index: " + index); + + throw new GenotypeDataException("Could not read bytes from: " + indexLong + " in genotype file " + genotypeDataFile.getAbsolutePath() + " (size: " + genotypeDataFile.length() + ")", e); } List alleles = new ArrayList(includedSamples.size()); diff --git a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java index a8fa0905c..f52fabcc6 100644 --- a/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java +++ b/Genotype-IO/src/main/java/org/molgenis/genotype/trityper/TriTyperGenotypeWriter.java @@ -15,6 +15,7 @@ import org.molgenis.genotype.Sample; import org.molgenis.genotype.variant.GeneticVariant; import org.molgenis.genotype.variant.NotASnpException; +import org.molgenis.genotype.variant.id.GeneticVariantId; /** * @@ -48,7 +49,7 @@ public void write(File folder) throws 
IOException { File snpMapFile = new File(folder, "SNPMappings.txt"); File individualFile = new File(folder, "Individuals.txt"); File phenotypeAnnotationFile = new File(folder, "PhenotypeInformation.txt"); - File allelRecodingFile = new File(folder, "allelRecodingInformation.txt"); + File allelRecodingFile = new File(folder, "AlleleRecodingInformation.txt"); writeSnps(snpFile, snpMapFile); writeSamples(individualFile, phenotypeAnnotationFile); @@ -70,14 +71,16 @@ private void writeSnps(File snpFile, File snpMapFile) throws IOException { // continue; // } - snpFileWriter.append(variant.getPrimaryVariantId()); + final String snpName = createTriTyperVariantId(variant); + + snpFileWriter.append(snpName); snpFileWriter.append('\n'); snpMapFileWriter.append(variant.getSequenceName()); snpMapFileWriter.append('\t'); snpMapFileWriter.append(String.valueOf(variant.getStartPos())); snpMapFileWriter.append('\t'); - snpMapFileWriter.append(variant.getPrimaryVariantId()); + snpMapFileWriter.append(snpName); snpMapFileWriter.append('\n'); } @@ -147,7 +150,7 @@ private void writeGenotypes(File genotypeDataFile, File imputedDosageDataFile, F a = sampleAlleles.get(0).isSnpAllele() && sampleAlleles.get(0) != Allele.ZERO ? (byte) sampleAlleles.get(0).getAlleleAsSnp() : 0; b = sampleAlleles.get(1).isSnpAllele() && sampleAlleles.get(1) != Allele.ZERO ? 
(byte) sampleAlleles.get(1).getAlleleAsSnp() : 0; } else { - snpRecodingInfo.add(variant.getPrimaryVariantId()+"\t"+variant.getSequenceName()+"\t"+variant.getStartPos()+"\t"+variant.getVariantAlleles().get(0)+"\t"+variant.getVariantAlleles().get(1)); + snpRecodingInfo.add(createTriTyperVariantId(variant)+"\t"+variant.getSequenceName()+"\t"+variant.getStartPos()+"\t"+variant.getVariantAlleles().get(0)+"\t"+variant.getVariantAlleles().get(1)); if(sampleAlleles.get(0).equals(variant.getVariantAlleles().get(0))){ a = (byte) 'A'; @@ -196,7 +199,7 @@ private void writeGenotypes(File genotypeDataFile, File imputedDosageDataFile, F if(!snpRecodingInfo.isEmpty()){ BufferedWriter allelRecodingFileWriter = new BufferedWriter(new FileWriter(allelRecodingFile)); - allelRecodingFileWriter.write("Variant_ID\tchr\tpos\tAllel1\tAllel2\n"); + allelRecodingFileWriter.write("Variant_ID\tChr\tPos\tAllele1\tAllele2\n"); for(String s : snpRecodingInfo){ allelRecodingFileWriter.write(s+"\n"); } @@ -204,4 +207,21 @@ private void writeGenotypes(File genotypeDataFile, File imputedDosageDataFile, F allelRecodingFileWriter.close(); } } + + private String createTriTyperVariantId(GeneticVariant variant) { + final GeneticVariantId snpId = variant.getVariantId(); + String snpName; + if(snpId.containsId()){ + snpName = snpId.getPrimairyId(); + } else { + snpName = variant.getSequenceName() + ':' + String.valueOf(variant.getStartPos()); + if(!variant.isSnp()){ + for(Allele allele : variant.getVariantAlleles()){ + snpName = snpName + "_" + allele.getAlleleAsString(); + } + + } + } + return snpName; + } } diff --git a/cellTypeSpecificAlleleSpecificExpression/README.md b/cellTypeSpecificAlleleSpecificExpression/README.md index 297b20d61..51fa77d78 100644 --- a/cellTypeSpecificAlleleSpecificExpression/README.md +++ b/cellTypeSpecificAlleleSpecificExpression/README.md @@ -1,3 +1 @@ -# Cell type specific Allele specific Expression - -please see the 
[wiki](https://github.com/adriaan-vd-graaf/systemsgenetics/wiki/ASE) for full documentation on usage and internal mechanisms. \ No newline at end of file +Please see the [wiki](https://github.com/adriaan-vd-graaf/systemsgenetics/wiki/ASE) for full documentation on usage and internal mechanisms. diff --git a/eQTLInteractionAnalyser/pom.xml b/eQTLInteractionAnalyser/pom.xml new file mode 100644 index 000000000..b2f231ebb --- /dev/null +++ b/eQTLInteractionAnalyser/pom.xml @@ -0,0 +1,92 @@ + + + 4.0.0 + + nl.systemsgenetics + systemsgenetics + 1.0.2-SNAPSHOT + + nl.systemsgenetics + eQTLInteractionAnalyser + 1.1-SNAPSHOT + eQTLInteractionAnalyser + http://maven.apache.org + + UTF-8 + + + + net.sf.jsci + jsci + 1.2 + + + org.apache.commons + commons-math3 + 3.2 + + + net.sourceforge.parallelcolt + parallelcolt + 0.10.0 + + + gov.nist.math.jama + gov.nist.math.jama + 1.1.1 + + + junit + junit + 3.8.1 + test + + + nl.systemsgenetics + genetica-libraries + 1.0.7-SNAPSHOT + + + com.opencsv + opencsv + 3.4 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + UTF-8 + + + + maven-assembly-plugin + + + jar-with-dependencies + + + + nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.EQTLInteractionAnalyser + true + true + + + + + + package + + single + + + + + + + diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/CompareToGeuvadis.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/CompareToGeuvadis.java new file mode 100644 index 000000000..799d77c2f --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/CompareToGeuvadis.java @@ -0,0 +1,79 @@ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import java.util.HashSet; +import java.util.Map; + +/** + * + * @author Patrick Deelen + */ +public class CompareToGeuvadis { 
+ + /** + * @param args the command line arguments + */ + public static void main(String[] args) { + + ExpressionDataset bios = new ExpressionDataset("/Volumes/Promise_RAID/lude/InteractionZScoresMatrix-4Covariates.txt.binary"); + ExpressionDataset geuvadis = new ExpressionDataset("/Volumes/Promise_RAID/projects/BBMRI/interactionsGeuvadisRegressOut/InteractionZScoresMatrix-9Covariates.txt.binary"); + + HashSet covariatesReplicated = new HashSet(); + HashSet genesReplicated = new HashSet(); + int interactionsReplicated = 0; + int sameDirection = 0; + int oppositeDirection = 0; + + for (Map.Entry covariateEntry : bios.hashProbes.entrySet()) { + for (Map.Entry eQtlGeneEntry : bios.hashSamples.entrySet()) { + + String covariate = covariateEntry.getKey(); + String eQtlGene = eQtlGeneEntry.getKey(); + +// if(!covariate.equals("ENSG00000084072")){ +// continue; +// } + + double biosInteractionZ = bios.rawData[covariateEntry.getValue()][eQtlGeneEntry.getValue()]; + + if (biosInteractionZ >= 6 || biosInteractionZ <= -6) { + + Integer geuvadisCovI = geuvadis.hashProbes.get(covariate); + Integer geuvadisGenI = geuvadis.hashSamples.get(eQtlGene); + + if (geuvadisCovI != null && geuvadisGenI != null) { + + double geuvadisInteractionZ = geuvadis.rawData[geuvadisCovI][geuvadisGenI]; + + if (geuvadisInteractionZ >= 5 || geuvadisInteractionZ <= -5) { + + covariatesReplicated.add(covariate); + genesReplicated.add(eQtlGene); + interactionsReplicated++; + + if(biosInteractionZ * geuvadisInteractionZ > 0){ + sameDirection++; + } else { + oppositeDirection++; + } + + System.out.println(covariate + "\t" + eQtlGene + "\t" + biosInteractionZ + "\t" + geuvadisInteractionZ); + + } + + } + + + } + + } + } + + System.out.println("Covariates replicated: " + covariatesReplicated.size()); + System.out.println("Genes replicated: " + genesReplicated.size()); + System.out.println("Interactions replicated: " + interactionsReplicated); + System.out.println("Interactions replicated same: " + 
sameDirection); + System.out.println("Interactions replicated opposite: " + oppositeDirection); + + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObject.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObject.java new file mode 100644 index 000000000..6438d9215 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObject.java @@ -0,0 +1,21 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +/** + * + * @author ludefranke + */ +public class DoubleArrayIntegerObject { + + public double[] doubleArray; + public int intValue; + public DoubleArrayIntegerObject(double[] doubleArray, int intValue) { + this.doubleArray = doubleArray; + this.intValue = intValue; + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObjectSorter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObjectSorter.java new file mode 100644 index 000000000..3e639d144 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/DoubleArrayIntegerObjectSorter.java @@ -0,0 +1,29 @@ +/* + * GeneLocationObjectSorter.java + * + * Created on 23 December 2003, 17:14 + */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +/** + * + * @author Like + */ +public class DoubleArrayIntegerObjectSorter extends VectorSorter { + + /** Creates a new instance of GeneLocationObjectSorter */ + public DoubleArrayIntegerObjectSorter() { + super(); + } + + /** 
Override object comparer + * @param a the first GeneLocationObject to be compared + * @param b the second GeneLocationObject to be compared + * @return true if the first GeneLocationObject.getChrStart() is lower than the second one + */ + protected boolean lt (Object a, Object b) { + return (((DoubleArrayIntegerObject)a).intValue < ((DoubleArrayIntegerObject)b).intValue); + } + +} \ No newline at end of file diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/EQTLInteractionAnalyser.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/EQTLInteractionAnalyser.java new file mode 100644 index 000000000..aa7e62ad3 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/EQTLInteractionAnalyser.java @@ -0,0 +1,352 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import java.io.*; + +import org.apache.commons.cli.*; +import umcg.genetica.io.text.TextFile; + +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.HashMap; + +/** + * + * @author lude + */ +public class EQTLInteractionAnalyser { + + private static final DateFormat DATE_TIME_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private static final Date currentDataTime = new Date(); + private static final Options OPTIONS; + + static { + + OPTIONS = new Options(); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArgs(); + OptionBuilder.withDescription("Path to the folder containing expression and genotype data"); + OptionBuilder.withLongOpt("input"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("i")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Path to the output folder"); + OptionBuilder.withLongOpt("output"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("o")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Path to the eQTL file to test for interactions"); + OptionBuilder.withLongOpt("eqtls"); + OPTIONS.addOption(OptionBuilder.create("e")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Path to the eQTL file to correct covariates"); + OptionBuilder.withLongOpt("eqtlsCovariates"); + OPTIONS.addOption(OptionBuilder.create("ec")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Path to the gene annotation file in the format of eQTL mapping pipeline"); + OptionBuilder.withLongOpt("annot"); + OPTIONS.addOption(OptionBuilder.create("a")); + + OptionBuilder.withArgName("int"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Maximum number of covariates to 
regress out"); + OptionBuilder.withLongOpt("maxcov"); + OPTIONS.addOption(OptionBuilder.create("n")); + + OptionBuilder.withDescription("Interpret the z-score matrices"); + OptionBuilder.withLongOpt("interpret"); + OPTIONS.addOption(OptionBuilder.create("it")); + + OptionBuilder.withDescription("Run permutation"); + OptionBuilder.withLongOpt("permute"); + OPTIONS.addOption(OptionBuilder.create("perm")); + + OptionBuilder.withDescription("Find chi2sum differences for each covariate between 2 consequtive interaction runs"); + OptionBuilder.withLongOpt("chi2sumDiff"); + OPTIONS.addOption(OptionBuilder.create("dif")); + + OptionBuilder.withArgName("int"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Start round for chi2sumDiff option"); + OptionBuilder.withLongOpt("start"); + OPTIONS.addOption(OptionBuilder.create("s")); + + OptionBuilder.withDescription("Preprocess the data"); + OptionBuilder.withLongOpt("preprocess"); + OPTIONS.addOption(OptionBuilder.create("p")); + + OptionBuilder.withDescription("Convert matrix"); + OptionBuilder.withLongOpt("convertMatrix"); + OPTIONS.addOption(OptionBuilder.create("cm")); + + OptionBuilder.withDescription("Skip all normalization step. n must be 1"); + OptionBuilder.withLongOpt("noNormalization"); + OPTIONS.addOption(OptionBuilder.create("nn")); + + OptionBuilder.withDescription("Skip covariate normalization step. 
n must be 1"); + OptionBuilder.withLongOpt("noCovNormalization"); + OPTIONS.addOption(OptionBuilder.create("ncn")); + + OptionBuilder.withArgName("strings"); + OptionBuilder.hasArgs(); + OptionBuilder.withDescription("covariates to correct for using an interaction term before running the interaction analysis"); + OptionBuilder.withLongOpt("cov"); + OPTIONS.addOption(OptionBuilder.create("c")); + + OptionBuilder.withArgName("strings"); + OptionBuilder.hasArgs(); + OptionBuilder.withDescription("Covariates to correct for without interaction term before running the interaction analysis"); + OptionBuilder.withLongOpt("cov2"); + OPTIONS.addOption(OptionBuilder.create("c2")); + + OptionBuilder.withArgName("strings"); + OptionBuilder.hasArgs(); + OptionBuilder.withDescription("Covariates to correct for without interaction term before running the interaction analysis"); + OptionBuilder.withLongOpt("cohorts"); + OPTIONS.addOption(OptionBuilder.create("ch")); + + OptionBuilder.withArgName("strings"); + OptionBuilder.hasArgs(); + OptionBuilder.withDescription("Covariates to to test in interaction analysis. Optional, all are tested if not used"); + OptionBuilder.withLongOpt("covTest"); + OPTIONS.addOption(OptionBuilder.create("ct")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File containing the covariates to correct for using an interaction term before running the interaction analysis. 
No header, each covariate on a separate line"); + OptionBuilder.withLongOpt("covFile"); + OPTIONS.addOption(OptionBuilder.create("cf")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File containing the SNPs to swap"); + OptionBuilder.withLongOpt("swap"); + OPTIONS.addOption(OptionBuilder.create("sw")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Included samples"); + OptionBuilder.withLongOpt("includedSamples"); + OPTIONS.addOption(OptionBuilder.create("is")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Gene annotation file"); + OptionBuilder.withLongOpt("geneAnnotation"); + OPTIONS.addOption(OptionBuilder.create("ga")); + + OptionBuilder.withArgName("int"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Number of threads"); + OptionBuilder.withLongOpt("threads"); + OPTIONS.addOption(OptionBuilder.create("nt")); + + OptionBuilder.withArgName("int"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Z-score difference threshold for interpretation"); + OptionBuilder.withLongOpt("threshold"); + OPTIONS.addOption(OptionBuilder.create("thr")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("SNPs to test"); + OptionBuilder.withLongOpt("snpsToTest"); + OPTIONS.addOption(OptionBuilder.create("snps")); + } + + public static void main(String[] args) throws IOException, Exception { + System.out.println("Starting interaction analysis"); + System.out.println("Current date and time: " + DATE_TIME_FORMAT.format(currentDataTime)); + System.out.println(); + + String inputDir, outputDir, eqtlFile = null, annotationFile = null; + final File snpsToSwapFile; + int maxNumCovariatesToRegress = 20; + int numThreads; + final boolean interpret, chi2sumDiff, permute, preproces; + final int startRoundCompareChi2, threshold; + + HashMap hashSamples; + + final 
String[] covariates; + final String[] covariates2; + final String[] cohorts; + final String[] covariatesToTest; + final File ensgAnnotationFile; + final File snpsToTestFile; + final boolean skipNormalization; + final boolean skipCovariateNormalization; + final boolean convertMatrix; + final String eqtlFileCovariates; + + try { + final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, false); + + inputDir = commandLine.getOptionValue("i"); + outputDir = commandLine.getOptionValue("o"); + + if (commandLine.hasOption('e')) { + eqtlFile = commandLine.getOptionValue("e"); + } + + + eqtlFileCovariates = commandLine.getOptionValue("ec", null); + + if (commandLine.hasOption('n')) { + maxNumCovariatesToRegress = Integer.parseInt(commandLine.getOptionValue("n")); + } + if (commandLine.hasOption("thr")) { + threshold = Integer.parseInt(commandLine.getOptionValue("thr")); + } + else { + threshold = 3; + } + + + interpret = commandLine.hasOption("it"); + chi2sumDiff = commandLine.hasOption("dif"); + permute = commandLine.hasOption("perm"); + preproces = commandLine.hasOption("p"); + convertMatrix = commandLine.hasOption("cm"); + + if (commandLine.hasOption('s')) { + startRoundCompareChi2 = Integer.parseInt(commandLine.getOptionValue("s")); + } else if(chi2sumDiff){ + throw new Exception("Set -s"); + } else { + startRoundCompareChi2 = 0; + } + + if (commandLine.hasOption('a')) { + annotationFile = commandLine.getOptionValue("a"); + } + + if (commandLine.hasOption("cf")) { + TextFile covFile = new TextFile(commandLine.getOptionValue("cf"), false); + covariates = covFile.readAsArray(); + covFile.close(); + } + else if (commandLine.hasOption("c")){ + covariates = commandLine.getOptionValues("c"); + } else { + covariates = new String[0]; + } + + if (commandLine.hasOption("c2")){ + covariates2 = commandLine.getOptionValues("c2"); + } else { + covariates2 = new String[0]; + } + + if (commandLine.hasOption("ch")){ + cohorts = commandLine.getOptionValues("ch"); + } else { 
+ cohorts = null; + } + + if (commandLine.hasOption("ct")){ + covariatesToTest = commandLine.getOptionValues("ct"); + } else { + covariatesToTest = null; + } + + if (commandLine.hasOption("sw")){ + snpsToSwapFile = new File(commandLine.getOptionValue("sw")); + } else { + snpsToSwapFile = null; + } + + if (commandLine.hasOption("snps")){ + snpsToTestFile = new File(commandLine.getOptionValue("snps")); + } else { + snpsToTestFile = null; + } + + skipNormalization = commandLine.hasOption("nn"); + if(skipNormalization && maxNumCovariatesToRegress != 1){ + System.err.println("n must be one if normalization is turned off"); + System.exit(-1); + } + + skipCovariateNormalization = commandLine.hasOption("ncn"); + if(skipCovariateNormalization && maxNumCovariatesToRegress != 1){ + System.err.println("n must be one if covariate normalization is turned off"); + System.exit(-1); + } + + if (commandLine.hasOption("is")){ + File samplesToIncludeFile = new File(commandLine.getOptionValue("is")); + System.out.println("Samples to include file: " + samplesToIncludeFile.getAbsolutePath()); + hashSamples = new HashMap(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(samplesToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + hashSamples.put(line, null); + hashSamples.put(line + "_exp", null); + hashSamples.put(line + "_dosage", null); + } + } else { + hashSamples = null; + } + + + if (commandLine.hasOption("ga")){ + ensgAnnotationFile = new File(commandLine.getOptionValue("ga")); + } else { + ensgAnnotationFile = null; + } + if (commandLine.hasOption("nt")) { + numThreads = Integer.parseInt(commandLine.getOptionValue("nt")); + } else { + numThreads = Runtime.getRuntime().availableProcessors(); + } + + } catch (ParseException ex) { + System.err.println("Invalid command line arguments: "); + System.err.println(ex.getMessage()); + System.err.println(); + new HelpFormatter().printHelp(" ", OPTIONS); + 
System.exit(1); + return; + } + + if(preproces){ + TestEQTLDatasetForInteractions interactor = new TestEQTLDatasetForInteractions(inputDir, outputDir); + interactor.preprocessData(); + } else if (interpret){ + TestEQTLDatasetForInteractions interactor = new TestEQTLDatasetForInteractions(inputDir, outputDir); + interactor.interpretInteractionZScoreMatrix(maxNumCovariatesToRegress, startRoundCompareChi2, threshold); + } + else if (chi2sumDiff){ + TestEQTLDatasetForInteractions interactor = new TestEQTLDatasetForInteractions(inputDir, outputDir); + interactor.findChi2SumDifferences(maxNumCovariatesToRegress, startRoundCompareChi2, ensgAnnotationFile); + } else if (convertMatrix){ + System.out.println("input file: " + inputDir); + System.out.println("output file: " + outputDir); + if(inputDir.equals(outputDir)){ + System.err.println("input == output"); + System.exit(1); + } + new ExpressionDataset(inputDir).save(outputDir); + } + else { + new TestEQTLDatasetForInteractions(inputDir, outputDir, eqtlFile, maxNumCovariatesToRegress, annotationFile, covariates, covariates2, snpsToSwapFile, permute, covariatesToTest, hashSamples, numThreads, cohorts, snpsToTestFile, skipNormalization, skipCovariateNormalization, eqtlFileCovariates); + } + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/ExpressionDataset.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/ExpressionDataset.java new file mode 100644 index 000000000..b446ebe19 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/ExpressionDataset.java @@ -0,0 +1,583 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import java.io.*; +import java.net.*; +import java.util.*; +import java.awt.image.BufferedImage; +import java.awt.image.*; +import java.awt.*; +import java.awt.geom.*; +import java.lang.Math; +import javax.imageio.*; +import org.apache.commons.lang3.StringUtils; + +/** + * + * @author lude + */ +public class ExpressionDataset { + + public double[][] rawData = null; + public int nrSamples = 0; + public int nrProbes = 0; + public String[] probeNames = null; + public String[] sampleNames = null; + public HashMap hashSamples = new HashMap(); + public HashMap hashProbes = new HashMap(); + private HashMap hashProbesToInclude = null; + private HashMap hashSamplesToInclude = null; + public String fileName = null; + + public ExpressionDataset(String fileName) { + if (fileName.endsWith(".binary")) { + loadExpressionDataInBinaryFormat(fileName); + } else { + loadExpressionData(fileName, '\t'); + } + } + + public ExpressionDataset(String fileName, char delimiter) { + if (fileName.endsWith(".binary")) { + loadExpressionDataInBinaryFormat(fileName); + } else { + loadExpressionData(fileName, delimiter); + } + } + + public ExpressionDataset(String fileName, char delimiter, HashMap hashProbesToInclude) { + this.hashProbesToInclude = hashProbesToInclude; + if (fileName.endsWith(".binary")) { + loadExpressionDataInBinaryFormat(fileName); + } else { + loadExpressionData(fileName, delimiter); + } + } + + public ExpressionDataset(String fileName, char delimiter, HashMap hashProbesToInclude, HashMap hashSamplesToInclude) { + this.hashProbesToInclude = hashProbesToInclude; + this.hashSamplesToInclude = hashSamplesToInclude; + if (fileName.endsWith(".binary")) { + loadExpressionDataInBinaryFormat(fileName); + } else { + loadExpressionData(fileName, delimiter); + } + } + + public ExpressionDataset(int nrProbes, int nrSamples) { + this.nrProbes = nrProbes; + this.nrSamples = nrSamples; + sampleNames = new 
String[nrSamples]; + for (int s=0; s2 && data[1].length() > 0 && data[1].equals("MultipleHits")) { + dataIsInTriTyperFormat = true; + sampleOffset = 9; + + } + + if (hashSamplesToInclude==null) { + nrSamples = data.length - sampleOffset; + sampleNames = new String[nrSamples]; + sampleIndex = new int[nrSamples]; + for (int s=0; s> 56); + buffer[bufferLoc + 1] = (byte) (bits >> 48 & 0xff); + buffer[bufferLoc + 2] = (byte) (bits >> 40 & 0xff); + buffer[bufferLoc + 3] = (byte) (bits >> 32 & 0xff); + buffer[bufferLoc + 4] = (byte) (bits >> 24 & 0xff); + buffer[bufferLoc + 5] = (byte) (bits >> 16 & 0xff); + buffer[bufferLoc + 6] = (byte) (bits >> 8 & 0xff); + buffer[bufferLoc + 7] = (byte) (bits & 0xff); + bufferLoc += 8; + } + try { + out.write(buffer); + } catch (IOException e) { + System.err.println("Can't write to " + fileBinary.getName() + ": " + e.getMessage()); + System.exit(1); + } + } + try { + out.close(); + } catch (IOException e) { + e.printStackTrace(); + } + File fileProbes = new File(fileName + ".rows.txt"); + try { + java.io.BufferedWriter outProbes = new java.io.BufferedWriter(new java.io.FileWriter(fileProbes)); + for (int p=0; p>> 24), + (byte) (value >>> 16), + (byte) (value >>> 8), + (byte) value}; + } + + private int byteArrayToInt(byte[] b) { + return (b[0] << 24) + + ((b[1] & 0xff) << 16) + + ((b[2] & 0xff) << 8) + + (b[3] & 0xff); + } + + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/GeneAnnotation.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/GeneAnnotation.java new file mode 100644 index 000000000..d38d252d9 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/GeneAnnotation.java @@ -0,0 +1,43 @@ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +/** + * + * @author Patrick Deelen + */ +public class 
GeneAnnotation { + + private final String ensg; + private final String huho; + private final String chr; + private final int start; + private final int end; + + public GeneAnnotation(String ensg, String huho, String chr, int start, int end) { + this.ensg = ensg; + this.huho = huho; + this.chr = chr; + this.start = start; + this.end = end; + } + + public String getEnsg() { + return ensg; + } + + public String getHuho() { + return huho; + } + + public String getChr() { + return chr; + } + + public int getStart() { + return start; + } + + public int getEnd() { + return end; + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/InteractionPlotter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/InteractionPlotter.java new file mode 100644 index 000000000..950953e8a --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/InteractionPlotter.java @@ -0,0 +1,574 @@ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import au.com.bytecode.opencsv.CSVReader; +import au.com.bytecode.opencsv.CSVWriter; +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.RenderingHints; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import static nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.TestEQTLDatasetForInteractions.getEqtls; +import static nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.TestEQTLDatasetForInteractions.getLinearRegressionCoefficients; +import static nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser.TestEQTLDatasetForInteractions.orthogonalizeDataset; +import org.apache.commons.math3.stat.ranking.NaturalRanking; + +/** + * + * @author Patrick Deelen + */ +public class 
InteractionPlotter { + + static String inputDir = null; + static String outputDir = null; + + /** + * @param args the command line arguments + */ + public static void main(String[] args) throws IOException { + + //makeInteractionPlot("D:\\tmp\\test.png", new double[]{0,0,0,0.2,1,1,1,1,2,2,2}, new double[]{5,4,3,0.2,8,12,6,7,23,5,7}, new double[]{3,2,1,0.2,2,6,4,6,20,2,5}); + + inputDir = args[0]; + outputDir = args[1]; + String eQTLfileName = args[2]; + String covariate = args[3]; + File genesFile = new File(args[4]); + + + + System.out.println("Input dir: " + inputDir); + System.out.println("Output dir: " + outputDir); + System.out.println("eQTL file: " + eQTLfileName); + System.out.println("covariate: " + covariate); + System.out.println("genes file: " + genesFile.getAbsolutePath()); + + String[] covsToCorrect = {"gender", "GC", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "CEU", "GBR", "FIN", "TSI", "YRI"}; + //String[] covsToCorrect = {"age", "gender", "GC", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "LLdeep", "LLS", "RS", "CODAM"}; + HashMap hashEQTLs = getEqtls(eQTLfileName); + + HashMap hashSamples = new HashMap(); + + if (1 == 1) { + + System.out.println("Removing outlier samples!!!"); + HashMap hashCovariates = new HashMap(); + hashCovariates.put("MEDIAN_5PRIME_BIAS", null); + hashCovariates.put("MEDIAN_3PRIME_BIAS", null); + ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, null); + hashSamples = new HashMap(); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + if (datasetCovariates.rawData[0][s] != 0) { + hashSamples.put(datasetCovariates.sampleNames[s], null); + } + } + datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, hashSamples); + HashMap hashSamplesToExclude = new HashMap(); + if (1 == 1) { + int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_5PRIME_BIAS")).intValue(); + 
double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]); + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + double z = (datasetCovariates.rawData[index][s] - mean) / stdev; + if (Math.abs(z) > 3) { + hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null); + } + } + } + if (1 == 1) { + int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_3PRIME_BIAS")).intValue(); + double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]); + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + double z = (datasetCovariates.rawData[index][s] - mean) / stdev; + if (Math.abs(z) > 3) { + hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null); + } + } + } + hashSamples = new HashMap(); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + if (!hashSamplesToExclude.containsKey(datasetCovariates.sampleNames[s])) { + hashSamples.put(datasetCovariates.sampleNames[s], null); + hashSamples.put(datasetCovariates.sampleNames[s] + "_exp", null); + hashSamples.put(datasetCovariates.sampleNames[s] + "_dosage", null); + } + } + } + + ExpressionDataset datasetGenotypes = new ExpressionDataset(inputDir + "/bigTableLude.txt.Genotypes.binary", '\t', hashEQTLs, hashSamples); + ExpressionDataset datasetExpression = new ExpressionDataset(inputDir + "/bigTableLude.txt.Expression.binary", '\t', hashEQTLs, hashSamples); + ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', null, hashSamples); + + org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression(); + int nrSamples = datasetGenotypes.nrSamples; + + + if (1 == 1) { + //Define a set of covariates that we want to use as 
correction: + System.out.println("Correcting gene expression data for cohort specific effects and top 25 components"); + //String[] cohorts = {"LLDeep", "LLS", "RS", "CODAM"}; + int nrCompsToCorrectFor = 25; + ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(nrCompsToCorrectFor, datasetGenotypes.nrSamples); + datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames; +// for (int p = 0; p < cohorts.length; p++) { +// for (int s = 0; s < datasetGenotypes.nrSamples; s++) { +// if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) { +// datasetCovariatesToCorrectFor.rawData[p][s] = 1; +// } +// } +// } + if (nrCompsToCorrectFor > 0) { + for (int comp = 0; comp < nrCompsToCorrectFor; comp++) { + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[comp][s] = datasetCovariates.rawData[datasetCovariates.nrProbes - 51 + comp][s]; + } + } + } + + datasetCovariatesToCorrectFor.transposeDataset(); + + datasetCovariatesToCorrectFor.save(inputDir + "/CovariatesToCorrectFor.txt"); + orthogonalizeDataset(inputDir + "/CovariatesToCorrectFor.txt"); + datasetCovariatesToCorrectFor = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.PrincipalComponents.txt"); + datasetCovariatesToCorrectFor.transposeDataset(); + ExpressionDataset datasetCovariatesToCorrectForEigenvalues = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.Eigenvalues.txt"); + for (int snp = 0; snp < datasetExpression.nrProbes; snp++) { + for (int cov = 0; cov < datasetCovariatesToCorrectFor.nrProbes; cov++) { + if (datasetCovariatesToCorrectForEigenvalues.rawData[cov][0] > 1E-5) { + double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetExpression.rawData[snp]); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetExpression.rawData[snp][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s]; + } + } + } + } + + + } + + + double[] mainEQTLCorr = new 
double[datasetGenotypes.nrProbes]; + if (1 == 1) { + System.out.println("Enforcing for every eQTL that the genotype dosage positively correlated with gene expression levels:"); + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + double corr = JSci.maths.ArrayMath.correlation(datasetGenotypes.rawData[snp], datasetExpression.rawData[snp]); + //System.out.println(datasetExpression.probeNames[snp] + "\t" + snp + "\t" + corr); + + if (corr < 0) { + corr = -corr; + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetGenotypes.rawData[snp][s] = 2 - datasetGenotypes.rawData[snp][s]; + } + } + + mainEQTLCorr[snp] = corr; + } + } + + if (1 == 1) { + + if (1 == 1) { + System.out.println("Correcting covariate data for cohort specific effects:"); +// String[] cohorts = {"LLDeep","LLS","RS","CODAM"}; + ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect.length, datasetGenotypes.nrSamples); + datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames; +// for (int p=0; p 1E-5) { + double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetCovariates.rawData[p]); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariates.rawData[p][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s]; + } + } + } + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[p]); + double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[p]); + if (stdev < 1E-5) { + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariates.rawData[p][s] = mean; + } + } + } + } + + + } + + if (1 == 1) { + System.out.println("Correcting covariate data for cis-eQTL effects:"); + for (int p = 0; p < datasetCovariates.nrProbes; p++) { + if (datasetExpression.hashProbes.containsKey(datasetCovariates.probeNames[p])) { + int index = ((Integer) datasetExpression.hashProbes.get(datasetCovariates.probeNames[p])).intValue(); + double[] rc = 
getLinearRegressionCoefficients(datasetGenotypes.rawData[index], datasetCovariates.rawData[p]); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariates.rawData[p][s] -= rc[0] * datasetGenotypes.rawData[index][s]; + } + } + } + } + + if (1 == 2) { + datasetCovariates.save(inputDir + "/CovariatesCorrected.txt"); + HashMap hashProbesToFilter = new HashMap(); + for (int p = 0; p < datasetCovariates.nrProbes; p++) { + if (datasetCovariates.probeNames[p].startsWith("ENSG")) { + hashProbesToFilter.put(datasetCovariates.probeNames[p], null); + } + } + ExpressionDataset datasetCovariatesCorrected = new ExpressionDataset(inputDir + "/CovariatesCorrected.txt", '\t', hashProbesToFilter, null); + datasetCovariatesCorrected.transposeDataset(); + datasetCovariatesCorrected.save(inputDir + "/CovariatesCorrected.txt"); + System.exit(0); + } + + if (1 == 2) { + ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/mixingmatrix.txt"); + //ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/signals.txt"); + datasetICA.transposeDataset(); + for (int p = 0; p < datasetICA.nrProbes; p++) { + datasetCovariates.rawData[p] = datasetICA.rawData[p]; + datasetCovariates.probeNames[p] = datasetICA.probeNames[p]; + if (p == 7) { + for (int q = 0; q < datasetCovariates.nrProbes; q++) { + double corr = JSci.maths.ArrayMath.correlation(datasetICA.rawData[p], datasetCovariates.rawData[q]); + System.out.println(p + "\t" + datasetICA.probeNames[p] + "\t" + q + "\t" + datasetCovariates.probeNames[q] + "\t" + corr + "\t" + corr * corr); + } + } + } + + orthogonalizeDataset("/Users/lude/Documents/ICA/mixingmatrix.txt"); + //System.exit(0); + } + + System.out.println("Enforcing normal distribution on covariates"); + + NaturalRanking ranker = new NaturalRanking(); + + for (int p = 0; p < datasetCovariates.nrProbes; p++) { + //Rank order the expression values: + double[] values = new double[datasetCovariates.nrSamples]; + for (int s 
= 0; s < datasetGenotypes.nrSamples; s++) { + values[s] = datasetCovariates.rawData[p][s]; + } + double[] rankedValues = ranker.rank(values); + //Replace the original expression value with the standard distribution enforce: + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + //Convert the rank to a proportion, with range <0, 1> + double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); + //Convert the pValue to a Z-Score: + double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue); + datasetCovariates.rawData[p][s] = zScore; //Replace original expression value with the Z-Score + } + } + + } + + cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = new cern.jet.random.tdouble.engine.DRand(); + + ExpressionDataset datasetExpressionBeforeEQTLCorrection = new ExpressionDataset(datasetExpression.nrProbes, datasetExpression.nrSamples); + for (int p = 0; p < datasetExpression.nrProbes; p++) { + for (int s = 0; s < datasetExpression.nrSamples; s++) { + datasetExpressionBeforeEQTLCorrection.rawData[p][s] = datasetExpression.rawData[p][s]; + } + } + + if (1 == 1) { + System.out.println("Correcting expression data for predefined gene environment interaction effects (GC content, Gender, 5'Median Bias, 3'Median Bias):"); + int[] covsToCorrectIndex = new int[covsToCorrect.length]; + for (int c = 0; c < covsToCorrect.length; c++) { + covsToCorrectIndex[c] = ((Integer) datasetCovariates.hashProbes.get(covsToCorrect[c])).intValue(); + } + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + double[][] valsX = new double[nrSamples][1 + covsToCorrect.length * 2]; //store genotypes, covariates, interactions + for (int s = 0; s < nrSamples; s++) { + valsX[s][0] = datasetGenotypes.rawData[snp][s]; //genotypes + } + for (int c = 0; c < covsToCorrect.length; c++) { + for (int s = 0; s < nrSamples; s++) { + valsX[s][c * 2 + 1] = datasetCovariates.rawData[covsToCorrectIndex[c]][s]; //covariate + valsX[s][c * 2 + 2] = 
valsX[s][0] * valsX[s][c * 2 + 1]; //interction + } + } + double[] valsY = datasetExpression.rawData[snp]; + regression.newSampleData(valsY, valsX); + datasetExpression.rawData[snp] = regression.estimateResiduals(); + } + } + + + if (1 == 1) { + System.out.println("Enforcing normal distribution on expression data:"); + + NaturalRanking ranker = new NaturalRanking(); + + for (int p = 0; p < datasetExpression.nrProbes; p++) { + //Rank order the expression values: + double[] values = new double[datasetExpression.nrSamples]; + for (int s = 0; s < datasetExpression.nrSamples; s++) { + values[s] = datasetExpression.rawData[p][s]; + } + + double[] rankedValues = ranker.rank(values); + //Replace the original expression value with the standard distribution enforce: + for (int s = 0; s < datasetExpression.nrSamples; s++) { + //Convert the rank to a proportion, with range <0, 1> + double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); + //Convert the pValue to a Z-Score: + double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue); + datasetExpression.rawData[p][s] = zScore; //Replace original expression value with the Z-Score + } + } + + System.out.println("Expression data now force normal"); + + } + + + CSVReader reader = new CSVReader(new FileReader(genesFile), '\t', CSVWriter.NO_QUOTE_CHARACTER); + String[] nextLine; + while ((nextLine = reader.readNext()) != null) { + + String eQtlGene = nextLine[0]; + + System.out.println(eQtlGene); + + Integer eQtlGeneI = datasetExpression.hashProbes.get(eQtlGene); + Integer covariateI = datasetCovariates.hashProbes.get(covariate); + Integer snpI = eQtlGeneI; + + makeInteractionPlot(outputDir + "/" + covariate + "-" + eQtlGene + ".png", datasetGenotypes.rawData[snpI], datasetExpression.rawData[eQtlGeneI], datasetCovariates.rawData[covariateI]); + + } + + } + + public static void makeInteractionPlot(String fileName, double[] genotype, double[] expression, double[] covariate) { + + int nrSamples = 
genotype.length; + +// int[] cohortIndex = new int[4]; +// String[] cohorts = {"LLDeep", "LLS", "RS", "CODAM"}; +// for (int cohort = 0; cohort < cohorts.length; cohort++) { +// for (int s = 0; s < nrSamples; s++) { +// if (sampleNames[s].startsWith(cohorts[cohort])) { +// cohortIndex[cohort] = s; +// break; +// } +// } +// } + + int marginLeft = 100; + int marginRight = 200; + int marginTop = 100; + int marginBottom = 100; + int innerHeight = 500; + int innerWidth = 500; + int docWidth = marginLeft + marginRight + innerWidth; + int docHeight = marginTop + marginBottom + innerHeight; + + BufferedImage bimage = new BufferedImage(docWidth, docHeight, BufferedImage.TYPE_INT_RGB); + Graphics2D g2d = bimage.createGraphics(); + + g2d.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON); + g2d.setColor(new Color(255, 255, 255)); + g2d.fillRect(0, 0, docWidth, docHeight); + java.awt.AlphaComposite alphaComposite10 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.10f); + java.awt.AlphaComposite alphaComposite25 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.25f); + java.awt.AlphaComposite alphaComposite50 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.50f); + java.awt.AlphaComposite alphaComposite100 = java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC, 1.00f); + + float fontSize = 12f; + java.awt.Font font = new java.awt.Font("Gill Sans MT", java.awt.Font.PLAIN, (int) fontSize); + java.awt.Font fontBold = new java.awt.Font("Gill Sans MT", java.awt.Font.BOLD, (int) fontSize); + java.awt.Font fontSmall = new java.awt.Font("Gill Sans MT", java.awt.Font.PLAIN, 8); + java.awt.Font fontBoldSmall = new java.awt.Font("Gill Sans MT", java.awt.Font.BOLD, 8); + + java.awt.Color dataColor[] = new Color[10]; + dataColor[0] = new java.awt.Color(167, 72, 20); + dataColor[1] = new java.awt.Color(62, 138, 20); + dataColor[2] = new java.awt.Color(228, 171, 0); + 
dataColor[3] = new java.awt.Color(0, 148, 183); + dataColor[4] = new java.awt.Color(119, 80, 152); + dataColor[5] = new java.awt.Color(106, 106, 106); + dataColor[6] = new java.awt.Color(212, 215, 10); + dataColor[7] = new java.awt.Color(210, 111, 0); + dataColor[8] = new java.awt.Color(0, 0, 141); + dataColor[9] = new java.awt.Color(190, 190, 190); + + g2d.setComposite(alphaComposite50); + g2d.setColor(new Color(0, 0, 0)); + g2d.drawLine(marginLeft, marginTop, marginLeft, marginTop + innerHeight); + g2d.drawLine(marginLeft, marginTop + innerHeight, marginLeft + innerWidth, marginTop + innerHeight); + + double minX = JSci.maths.ArrayMath.min(covariate); + double maxX = JSci.maths.ArrayMath.max(covariate); + double minY = JSci.maths.ArrayMath.min(expression); + double maxY = JSci.maths.ArrayMath.max(expression); + + g2d.setComposite(alphaComposite10); + for (int rep = 0; rep >= 0; rep--) { + for (int s = 0; s < nrSamples; s++) { + int posY = marginTop + innerHeight - (int) ((expression[s] - minY) / (maxY - minY) * innerHeight); + int posX = marginLeft + (int) ((covariate[s] - minX) / (maxX - minX) * innerWidth); + if (genotype[s] < 0.5) { + g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f)); + g2d.setColor(new Color(204, 86, 78)); + } else { + if (genotype[s] > 1.5) { + g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f)); + g2d.setColor(new Color(171, 178, 114)); + } else { + g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f)); + g2d.setColor(new Color(98, 175, 255)); + } + } + + g2d.fillOval(posX - 5 - rep * 4, posY - 5 - rep * 4, 7 + rep * 8, 7 + rep * 8); + + } + } + + //Draw the four independent cohorts seperately: + //int[] cohortIndex = {0,626,1280,1933}; +// for (int rep = 2; rep >= 0; rep--) { +// for (int s = 0; s < nrSamples; s++) { +// int cohort = 0; +// for (int c = 
0; c < cohortIndex.length; c++) { +// if (s >= cohortIndex[c]) { +// cohort = c; +// } +// } +// +// int posY = marginTop + 100 + cohort * 125 - (int) ((expression[s] - minY) / (maxY - minY) * 100); +// int posX = marginLeft + innerWidth + 50 + (int) ((covariate[s] - minX) / (maxX - minX) * 100); +// if (genotype[s] < 0.5) { +// g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f)); +// g2d.setColor(new Color(204, 86, 78)); +// } else { +// if (genotype[s] > 1.5) { +// g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f)); +// g2d.setColor(new Color(171, 178, 114)); +// } else { +// g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_ATOP, 0.30f - (float) rep / 10f)); +// g2d.setColor(new Color(98, 175, 255)); +// } +// } +// g2d.fillOval(posX - 1 - rep * 2, posY - 1 - rep * 2, 3 + rep * 4, 3 + rep * 4); +// +// } +// } + + + g2d.setComposite(alphaComposite50); + double[][] valsX = new double[nrSamples][3]; + for (int s = 0; s < nrSamples; s++) { + valsX[s][0] = genotype[s]; + valsX[s][1] = covariate[s]; + valsX[s][2] = valsX[s][0] * valsX[s][1]; + } + double[] valsY = expression; + org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression(); + regression.newSampleData(valsY, valsX); + double[] betas = regression.estimateRegressionParameters(); + double betaInteraction = betas[3]; + double seInteraction = regression.estimateRegressionParametersStandardErrors()[3]; + double tInteraction = betaInteraction / seInteraction; + double pValueInteraction = 1; + double zScoreInteraction = 0; + cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = new cern.jet.random.tdouble.engine.DRand(); + cern.jet.random.tdouble.StudentT tDistColt = new cern.jet.random.tdouble.StudentT(genotype.length - 4, randomEngine); + if 
(tInteraction < 0) { + pValueInteraction = tDistColt.cdf(tInteraction); + if (pValueInteraction < 2.0E-323) { + pValueInteraction = 2.0E-323; + } + zScoreInteraction = cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); + } else { + pValueInteraction = tDistColt.cdf(-tInteraction); + if (pValueInteraction < 2.0E-323) { + pValueInteraction = 2.0E-323; + } + zScoreInteraction = -cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); + } + pValueInteraction *= 2; + + String pValueString = (new java.text.DecimalFormat("0.##E0", new java.text.DecimalFormatSymbols(java.util.Locale.US))).format(pValueInteraction); + if (pValueInteraction > 0.001) { + pValueString = (new java.text.DecimalFormat("##.###;-##.###", new java.text.DecimalFormatSymbols(java.util.Locale.US))).format(pValueInteraction); + } + g2d.setFont(new java.awt.Font("Arial", java.awt.Font.BOLD, 14)); + g2d.setColor(new Color(0, 0, 0)); + int posX = marginLeft; + int posY = marginTop + innerHeight + 20; + g2d.drawString("Interaction P-Value: " + pValueString, posX, posY); + + + for (int g = 0; g <= 2; g++) { + + double valMin = betas[0] + betas[1] * g + minX * betas[2] + betas[3] * g * minX; + double valMax = betas[0] + betas[1] * g + maxX * betas[2] + betas[3] * g * maxX; + int posXMin = marginLeft + (int) ((minX - minX) / (maxX - minX) * innerWidth); + int posYMin = marginTop + innerHeight - (int) ((valMin - minY) / (maxY - minY) * innerHeight); + int posXMax = marginLeft + (int) ((maxX - minX) / (maxX - minX) * innerWidth); + int posYMax = marginTop + innerHeight - (int) ((valMax - minY) / (maxY - minY) * innerHeight); + + g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.8f)); + g2d.setColor(new Color(255, 255, 255)); + g2d.setStroke(new java.awt.BasicStroke(5.0f, java.awt.BasicStroke.CAP_ROUND, java.awt.BasicStroke.JOIN_ROUND)); + g2d.drawLine(posXMin, posYMin, posXMax, posYMax); + if (g < 0.5) { + 
g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.30f)); + g2d.setColor(new Color(204, 86, 78)); + } else { + if (g > 1.5) { + g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.50f)); + g2d.setColor(new Color(171, 178, 114)); + } else { + g2d.setComposite(java.awt.AlphaComposite.getInstance(java.awt.AlphaComposite.SRC_OVER, 0.50f)); + g2d.setColor(new Color(98, 175, 255)); + } + } + g2d.setStroke(new java.awt.BasicStroke(3.0f, java.awt.BasicStroke.CAP_ROUND, java.awt.BasicStroke.JOIN_ROUND)); + g2d.drawLine(posXMin, posYMin, posXMax, posYMax); + + } + + try { + javax.imageio.ImageIO.write(bimage, "png", new File(fileName)); + } catch (IOException e) { + System.out.println(e.getMessage()); + e.printStackTrace(); + } + + + } +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/PerformInteractionAnalysisPermutationTask.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/PerformInteractionAnalysisPermutationTask.java new file mode 100644 index 000000000..f31b5e6f2 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/PerformInteractionAnalysisPermutationTask.java @@ -0,0 +1,118 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import gnu.trove.set.hash.TIntHashSet; +import java.util.concurrent.Callable; +import org.apache.commons.math3.linear.SingularMatrixException; +import org.apache.commons.math3.stat.regression.SimpleRegression; + +/** + * + * @author lude + */ +public class PerformInteractionAnalysisPermutationTask implements Callable { + + public ExpressionDataset datasetGenotypes; + public ExpressionDataset datasetExpression; + public ExpressionDataset datasetCovariates; + ExpressionDataset datasetCovariatesPCAForceNormal; + public int covToTest = -1; + public int nrSamples = -1; + public org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = null; + public cern.jet.random.tdouble.StudentT tDistColt = null; + private final SkippedInteractionTracker skippedTracker; + private final SkippedInteractionWriter skippedWriter; + private final TIntHashSet snpsToTest; + + public PerformInteractionAnalysisPermutationTask(ExpressionDataset datasetGenotypes, ExpressionDataset datasetExpression, ExpressionDataset datasetCovariates, ExpressionDataset datasetCovariatesPCAForceNormal, int covToTest, SkippedInteractionWriter skippedWriter, final TIntHashSet snpsToTest) { + this.datasetGenotypes = datasetGenotypes; + this.datasetExpression = datasetExpression; + this.datasetCovariates = datasetCovariates; + this.datasetCovariatesPCAForceNormal = datasetCovariatesPCAForceNormal; + this.covToTest = covToTest; + this.nrSamples = datasetGenotypes.nrSamples; + this.skippedTracker = new SkippedInteractionTracker(datasetCovariates.probeNames[covToTest]); + this.skippedWriter = skippedWriter; + this.snpsToTest = snpsToTest; + + this.regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression(); + cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = new cern.jet.random.tdouble.engine.DRand(); + this.tDistColt = new 
cern.jet.random.tdouble.StudentT(this.nrSamples - 4, randomEngine); + + } + + @Override + public DoubleArrayIntegerObject call() throws Exception { + double corrPvalueThreshold = 0.0001; + double[] zScores = new double[datasetGenotypes.nrProbes]; + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + + if(snpsToTest != null && !snpsToTest.contains(snp)){ + continue; + } + + double corrPvalue = correlateCovariateWithGenotype(snp); + if (corrPvalue > corrPvalueThreshold) { // don't compute the interaction if the covariate expression is affected by theis SNP + try { + + double[][] valsX = new double[nrSamples][3]; + for (int s = 0; s < nrSamples; s++) { + valsX[s][0] = datasetGenotypes.rawData[snp][s]; + valsX[s][1] = datasetCovariates.rawData[covToTest][s]; + valsX[s][2] = valsX[s][0] * valsX[s][1]; + } + double[] valsY = datasetExpression.rawData[snp]; + regression.newSampleData(valsY, valsX); + double betaInteraction = regression.estimateRegressionParameters()[3]; + double seInteraction = regression.estimateRegressionParametersStandardErrors()[3]; + double tInteraction = betaInteraction / seInteraction; + double pValueInteraction = 1; + double zScoreInteraction = 0; + if (tInteraction < 0) { + pValueInteraction = tDistColt.cdf(tInteraction); + if (pValueInteraction < 2.0E-323) { + pValueInteraction = 2.0E-323; + } + zScoreInteraction = cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); + } else { + pValueInteraction = tDistColt.cdf(-tInteraction); + if (pValueInteraction < 2.0E-323) { + pValueInteraction = 2.0E-323; + } + zScoreInteraction = -cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); + } + zScores[snp] = zScoreInteraction; + } catch (SingularMatrixException e) { + zScores[snp] = 0; + skippedTracker.addSkipped(SkippedInteractionTracker.Reason.SINGULAR, datasetGenotypes.probeNames[snp]); + } + } + else{ + //System.out.println("Removing covariate because of eQTL effect! 
" + datasetCovariatesPCAForceNormal.probeNames[covToTest] + " : " + datasetGenotypes.probeNames[snp]); + skippedTracker.addSkipped(SkippedInteractionTracker.Reason.SHARED_QTL, datasetGenotypes.probeNames[snp]); + zScores[snp] = 0; + } + + } + skippedWriter.add(skippedTracker); + return new DoubleArrayIntegerObject(zScores, covToTest); + } + + private double correlateCovariateWithGenotype(int snp){ + SimpleRegression simpleRegression = new SimpleRegression(); + double[] expression = datasetCovariatesPCAForceNormal.rawData[covToTest]; + double[] genotypes = datasetGenotypes.rawData[snp]; + for (int s = 0; s < expression.length; s++) { + simpleRegression.addData(expression[s], genotypes[s]); + } + //This is not working now that we have the _rs next to the gene names +// if (datasetGenotypes.probeNames[snp].equals(datasetCovariatesPCAForceNormal.probeNames[covToTest])){ +// System.out.println("Same gene! " + datasetGenotypes.probeNames[snp] + "\t" + datasetCovariatesPCAForceNormal.probeNames[covToTest] + "\t" + simpleRegression.getSignificance() + "\t" + simpleRegression.getR()); +// } + return simpleRegression.getSignificance(); + } +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionTracker.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionTracker.java new file mode 100644 index 000000000..6c55604df --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionTracker.java @@ -0,0 +1,40 @@ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.EnumMap; + +/** + * + * @author Patrick Deelen + */ +public class SkippedInteractionTracker { + + public static enum Reason { + SINGULAR, SHARED_QTL + } + + private final String 
covariate; + EnumMap> skipped; + + public SkippedInteractionTracker(String covariate) { + this.covariate = covariate; + skipped = new EnumMap>(Reason.class); + for(Reason r : Reason.values()){ + skipped.put(r, new ArrayList()); + } + } + + public void addSkipped(Reason r, String qtl){ + skipped.get(r).add(qtl); + } + + public String getCovariate() { + return covariate; + } + + public EnumMap> getSkipped() { + return skipped; + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionWriter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionWriter.java new file mode 100644 index 000000000..abaa52563 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/SkippedInteractionWriter.java @@ -0,0 +1,65 @@ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import au.com.bytecode.opencsv.CSVWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; + +/** + * + * @author Patrick Deelen + */ +public class SkippedInteractionWriter { + + private final CSVWriter writer; + private final String[] row = new String[5]; + private int c; + private StringBuilder tmp; + + public SkippedInteractionWriter(File skippedInteractionsFile) throws IOException { + writer = new CSVWriter(new FileWriter(skippedInteractionsFile), '\t', CSVWriter.NO_QUOTE_CHARACTER); + + c = 0; + row[c++] = "Covariate"; + row[c++] = "CountSingular"; + row[c++] = "CountSharedQtl"; + row[c++] = "SingularQtls"; + row[c++] = "SharedQtls"; + + writer.writeNext(row); + } + + public void close() throws IOException{ + writer.close(); + } + + synchronized void add(SkippedInteractionTracker skipped){ + + ArrayList singular = skipped.getSkipped().get(SkippedInteractionTracker.Reason.SINGULAR); + ArrayList 
sharedQtl = skipped.getSkipped().get(SkippedInteractionTracker.Reason.SHARED_QTL); + + c = 0; + row[c++] = skipped.getCovariate(); + row[c++] = String.valueOf(singular.size()); + row[c++] = String.valueOf(sharedQtl.size()); + + tmp = new StringBuilder(); + for(String qtl : singular){ + tmp.append(qtl); + tmp.append(';'); + } + row[c++] = tmp.toString(); + + tmp = new StringBuilder(); + for(String qtl : sharedQtl){ + tmp.append(qtl); + tmp.append(';'); + } + row[c++] = tmp.toString(); + + writer.writeNext(row); + + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObject.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObject.java new file mode 100644 index 000000000..07016fac6 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObject.java @@ -0,0 +1,21 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +/** + * + * @author ludefranke + */ +public class StringIntegerObject { + + public String stringValue; + public int intValue; + public StringIntegerObject(String stringValue, int intValue) { + this.stringValue = stringValue; + this.intValue = intValue; + } + +} diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObjectSorter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObjectSorter.java new file mode 100644 index 000000000..bb25906f8 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/StringIntegerObjectSorter.java @@ -0,0 +1,36 @@ +/* + * GeneLocationObjectSorter.java + * + * Created on 23 December 2003, 17:14 + */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +/** + * + * @author Like + */ +public class StringIntegerObjectSorter extends VectorSorter { + + private java.text.Collator collatorUS = null; + + /** Creates a new instance of GeneLocationObjectSorter */ + public StringIntegerObjectSorter() { + super(); + collatorUS = java.text.Collator.getInstance(java.util.Locale.US); + } + + /** Override object comparer + * @param a the first GeneLocationObject to be compared + * @param b the second GeneLocationObject to be compared + * @return true if the first GeneLocationObject.getChrStart() is lower than the second one + */ + protected boolean lt (Object a, Object b) { + if (collatorUS.compare(((StringIntegerObject)a).stringValue, ((StringIntegerObject)b).stringValue) >= 0) { + return false; + } else { + return true; + } + } + +} \ No newline at end of file diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/TestEQTLDatasetForInteractions.java 
b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/TestEQTLDatasetForInteractions.java new file mode 100644 index 000000000..9611fa474 --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/TestEQTLDatasetForInteractions.java @@ -0,0 +1,1540 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + +import au.com.bytecode.opencsv.CSVReader; +import au.com.bytecode.opencsv.CSVWriter; +import com.google.common.collect.HashMultimap; +import gnu.trove.set.hash.TIntHashSet; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.Vector; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Executors; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.stat.descriptive.moment.Variance; +import org.apache.commons.math3.stat.ranking.NaturalRanking; +import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression; +import org.apache.mahout.math.Arrays; +import umcg.genetica.genomicboundaries.GenomicBoundary; +import umcg.genetica.io.Gpio; 
+import umcg.genetica.io.text.TextFile; +import umcg.genetica.io.trityper.EQTL; +import umcg.genetica.io.trityper.QTLTextFile; + +/** + * + * @author lude + */ +public class TestEQTLDatasetForInteractions { + + String inputDir = null; + String outputDir = null; + HashMap> geneDistanceMap = null; + String[] primaryCovsToCorrect; + ExpressionDataset datasetGenotypes; + + public TestEQTLDatasetForInteractions(String inputDir, String outputDir) throws IOException { + + this.inputDir = inputDir; + this.outputDir = outputDir; + primaryCovsToCorrect = new String[]{"gender", "GC", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "LLdeep", "RS", "CODAM", "LLS"}; + //preprocessData(); + } + + public TestEQTLDatasetForInteractions(String inputDir, String outputDir, String eQTLfileName, int maxNumTopCovs, String annotationFile, String[] covariatesToCorrect, String[] covariatesToCorrect2, File snpsToSwapFile, boolean permute, String[] covariatesToTest, HashMap hashSamples, int numThreads, String[] cohorts, File snpsToTestFile, boolean skipNormalization, boolean skipCovariateNormalization, String eQTLfileNameCovariates) throws IOException, Exception { + + System.out.println("Input dir: " + inputDir); + System.out.println("Output dir: " + outputDir); + System.out.println("eQTL file: " + eQTLfileName); + System.out.println("eQTL file covariates: " + eQTLfileNameCovariates); + System.out.println("Maximum number of covariates to regress out: " + maxNumTopCovs); + System.out.println("Covariates to correct for with interaction: " + Arrays.toString(covariatesToCorrect)); + System.out.println("Covariates to correct for without interaction: " + Arrays.toString(covariatesToCorrect2)); + if (covariatesToTest != null) { + System.out.println("Covariates to test: " + Arrays.toString(covariatesToTest)); + } + + this.inputDir = inputDir; + this.outputDir = outputDir; + primaryCovsToCorrect = covariatesToCorrect; + if (!Gpio.exists(outputDir)) { + Gpio.createDir(outputDir); + } + + 
initGenotypes(permute, hashSamples, cohorts); + + final HashMultimap qtlProbeSnpMultiMap = HashMultimap.create(); + if (eQTLfileName != null) { + final QTLTextFile eQtlFileReader = new QTLTextFile(eQTLfileName, false); + for (Iterator it = eQtlFileReader.getEQtlIterator(); it.hasNext();) { + EQTL qtl = it.next(); + qtlProbeSnpMultiMap.put(qtl.getProbe(), qtl.getRsName()); + } + } + + final HashMultimap qtlProbeSnpMultiMapCovariates; + if(eQTLfileNameCovariates != null){ + qtlProbeSnpMultiMapCovariates = HashMultimap.create(); + final QTLTextFile eQtlFileReader = new QTLTextFile(eQTLfileNameCovariates, false); + for (Iterator it = eQtlFileReader.getEQtlIterator(); it.hasNext();) { + EQTL qtl = it.next(); + qtlProbeSnpMultiMapCovariates.put(qtl.getProbe(), qtl.getRsName()); + } + } else { + qtlProbeSnpMultiMapCovariates = qtlProbeSnpMultiMap; + } + + if (annotationFile != null) { + createGeneDistanceMap(annotationFile); + } + + final TIntHashSet snpsToTest; + if (snpsToTestFile != null) { + + snpsToTest = new TIntHashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(snpsToTestFile), "UTF-8")); + + String line; + while ((line = reader.readLine()) != null) { + Integer genotypeI = datasetGenotypes.hashProbes.get(line); + + if (genotypeI == null) { + System.out.println("SNP " + line + " not found in genotype data"); + continue; + } + + if (!snpsToTest.add(genotypeI)) { + System.out.println("Warning including SNP twice: " + line); + } + + } + + System.out.println("Confining testing to: " + snpsToTest.size() + " SNPs from: " + snpsToTestFile.getAbsolutePath()); + + } else { + snpsToTest = null; + } + + //preprocessData(); + + TextFile outputTopCovs = new TextFile(outputDir + "/outputTopCovariates.txt", true); + + System.out.print("\nPrimary covariates to correct for before running interaction analysis: "); + for (String cov : primaryCovsToCorrect) { + System.out.print("\n\t" + cov); + } + System.out.println(); + + + + String[] 
covsToCorrect = primaryCovsToCorrect; + int cnt = 0; + while (cnt < maxNumTopCovs) { + String topCov = performInteractionAnalysis(covsToCorrect, covariatesToCorrect2, outputTopCovs, snpsToSwapFile, qtlProbeSnpMultiMap, covariatesToTest, hashSamples, numThreads, snpsToTest, skipNormalization, skipCovariateNormalization, qtlProbeSnpMultiMapCovariates); + String[] covsToCorrectNew = new String[covsToCorrect.length + 1]; + for (int c = 0; c < covsToCorrect.length; c++) { + covsToCorrectNew[c] = covsToCorrect[c]; + } + covsToCorrectNew[covsToCorrect.length] = topCov; + covsToCorrect = covsToCorrectNew; + cnt++; + } + outputTopCovs.close(); + } + + private void initGenotypes(boolean permute, HashMap hashSamples, String[] cohorts) { + + datasetGenotypes = new ExpressionDataset(inputDir + "/bigTableLude.txt.Genotypes.binary", '\t', null, hashSamples); + + if (permute) { + System.out.println("WARNING: PERMUTING GENOTYPE DATA!!!!"); + if (cohorts == null) { + cohorts = new String[]{"LLDeep", "LLS", "RS", "CODAM"}; + } + int[] permSampleIDs = new int[datasetGenotypes.nrSamples]; + for (int p = 0; p < cohorts.length; p++) { + Vector vecSamples = new Vector(); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + //if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) { + vecSamples.add(s); + //} + } + + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + //if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) { + int randomSample = ((Integer) vecSamples.remove((int) ((double) vecSamples.size() * Math.random()))).intValue(); + permSampleIDs[s] = randomSample; + //} + } + } + + ExpressionDataset datasetGenotypes2 = new ExpressionDataset(datasetGenotypes.nrProbes, datasetGenotypes.nrSamples); + datasetGenotypes2.probeNames = datasetGenotypes.probeNames; + datasetGenotypes2.sampleNames = datasetGenotypes.sampleNames; + datasetGenotypes2.recalculateHashMaps(); + for (int p = 0; p < datasetGenotypes2.nrProbes; p++) { + for (int s = 0; s < 
datasetGenotypes2.nrSamples; s++) { + datasetGenotypes2.rawData[p][s] = datasetGenotypes.rawData[p][permSampleIDs[s]]; + } + } + datasetGenotypes = datasetGenotypes2; + } + + } + + /** + * Extracts eQTL gene names + * + * @param fname - eQTL file (in the eqtlmappingpipeline format) + * @return gene names in keys of a HashMap + * @throws IOException + */ + public static HashMap getEqtls(String fname) throws IOException { + if (fname == null) { + return null; + } + TextFile file = new TextFile(fname, false); + ArrayList genes = file.readAsArrayList(4, TextFile.tab); + HashMap eqtlGenes = new HashMap(); + for (String gene : genes) { + eqtlGenes.put(gene, null); + } + file.close(); + return eqtlGenes; + + } + + public void interpretInteractionZScoreMatrix(int maxNumRegressedCovariates, int numPrimaryCovsToCorrect, int zscoreDiffThreshold) throws IOException { + + System.out.println("Interpreting the z-score matrix"); + + for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) { + if (! 
new File(outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary.dat").exists()) { + ExpressionDataset dataset = new ExpressionDataset(outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt"); + dataset.save(dataset.fileName + ".binary"); + } + else { + System.out.println("Binary z-score matrix already exists, not overwriting it: " + outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary.dat"); + } + } + + TextFile out = new TextFile(outputDir + "zscoreDiff.txt", true); + out.writeln("numCovsRemoved\tcovariate\teQTL\tz-score_before\tz-score_after\tdifference"); + for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) { + + ExpressionDataset dataset = new ExpressionDataset(outputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary"); + ExpressionDataset dataset2 = new ExpressionDataset(outputDir + "/InteractionZScoresMatrix-" + (nrCovsRemoved + 1) + "Covariates.txt.binary"); + + for (int q = 0; q < dataset.nrSamples; q++) { + double maxAbsZDiff = 0; + String output = ""; + for (int p = 0; p < dataset.nrProbes; p++) { + double zDiff = dataset.rawData[p][q] - dataset2.rawData[p][q]; + double absZDiff = Math.abs(zDiff); + if (absZDiff > 2 && absZDiff > maxAbsZDiff) { + maxAbsZDiff = absZDiff; + output = nrCovsRemoved + "\t" + dataset.probeNames[p] + "\t" + dataset.sampleNames[q] + "\t" + dataset.rawData[p][q] + "\t" + dataset2.rawData[p][q] + "\t" + zDiff; + } + } + if (maxAbsZDiff > zscoreDiffThreshold) { + System.out.println(output); + out.writeln(output); + } + } + } + out.close(); + } + + public void findChi2SumDifferences(int maxNumRegressedCovariates, int numPrimaryCovsToCorrect, File ensgAnnotationFile) throws IOException { + + Map ensgAnnotations; + if (ensgAnnotationFile == null) { + ensgAnnotations = Collections.emptyMap(); + } else { + ensgAnnotations = 
readEnsgAnnotations(ensgAnnotationFile); + } + + double[][] topCovZscores = null; + String[] topCovs = new String[maxNumRegressedCovariates]; + String[] genes = null; + + System.out.println("Interpreting the z-score matrix"); + System.out.println("Preparing the data"); + for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) { + + if (new File(inputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary.dat").exists()) { + System.out.println(""); + System.out.println("USING EXISTING BINARY FILE!!!!"); + System.out.println(""); + continue; + } + + ExpressionDataset dataset = new ExpressionDataset(inputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt"); + dataset.save(dataset.fileName + ".binary"); + } + + System.out.println("Comparing chi2sums"); + + double[] previousChi2 = null; + String[][] output = null; + boolean firstDataset = true; + String[] header = null; + double topCovChi2; + String topCov = "Technical"; + int topCovI = -1; + + for (int nrCovsRemoved = numPrimaryCovsToCorrect; nrCovsRemoved < numPrimaryCovsToCorrect + maxNumRegressedCovariates; nrCovsRemoved++) { + + ExpressionDataset dataset = new ExpressionDataset(inputDir + "/InteractionZScoresMatrix-" + nrCovsRemoved + "Covariates.txt.binary"); + + if (firstDataset) { + previousChi2 = new double[dataset.nrProbes]; + output = new String[dataset.nrProbes][(maxNumRegressedCovariates * 2) + 5]; + for (int covariate = 0; covariate < dataset.nrProbes; covariate++) { + output[covariate][0] = dataset.probeNames[covariate]; + GeneAnnotation geneAnnotation = ensgAnnotations.get(dataset.probeNames[covariate]); + if (geneAnnotation == null) { + output[covariate][1] = ""; + output[covariate][2] = ""; + output[covariate][3] = ""; + output[covariate][4] = ""; + } else { + output[covariate][1] = geneAnnotation.getHuho(); + output[covariate][2] = geneAnnotation.getChr(); + output[covariate][3] = 
String.valueOf(geneAnnotation.getStart()); + output[covariate][4] = String.valueOf(geneAnnotation.getEnd()); + } + + } + header = new String[(maxNumRegressedCovariates * 2) + 5]; + header[0] = "Covariate gene"; + header[1] = "Gene symbol"; + header[2] = "Chr"; + header[3] = "Start"; + header[4] = "End"; + + genes = dataset.sampleNames; + topCovZscores = new double[genes.length][maxNumRegressedCovariates]; + } + + int outputColOffset = 5 + (nrCovsRemoved - numPrimaryCovsToCorrect) * 2; + + header[outputColOffset] = topCov + "_removed_chi2sum"; + header[1 + outputColOffset] = "Difference"; + + topCovChi2 = 0; + + for (int covariate = 0; covariate < dataset.nrProbes; covariate++) { + double chi2Sum = 0; + double[] covariateData = dataset.rawData[covariate]; + for (int gene = 0; gene < dataset.nrSamples; gene++) { + chi2Sum += covariateData[gene] * covariateData[gene]; + } + + if (chi2Sum > topCovChi2 && !dataset.probeNames[covariate].startsWith("Comp") && !dataset.probeNames[covariate].equals("LLS") && !dataset.probeNames[covariate].equals("LLdeep") && !dataset.probeNames[covariate].equals("RS") && !dataset.probeNames[covariate].equals("CODAM")) { + topCovChi2 = chi2Sum; + topCov = dataset.probeNames[covariate]; + topCovI = covariate; + } + + output[covariate][outputColOffset] = String.valueOf(chi2Sum); + output[covariate][1 + outputColOffset] = firstDataset ? 
"0" : String.valueOf(previousChi2[covariate] - chi2Sum); + previousChi2[covariate] = chi2Sum; + + //System.out.println(nrCovsRemoved + "\t" + dataset.probeNames[covariate] + "\t" + chi2Sum1 + "\t" + chi2Sum2 + "\t" + (chi2Sum1 - chi2Sum2)); + } + + topCovs[nrCovsRemoved - numPrimaryCovsToCorrect] = topCov; + double[] covariateData = dataset.rawData[topCovI]; + for (int gene = 0; gene < dataset.nrSamples; gene++) { + topCovZscores[gene][nrCovsRemoved - numPrimaryCovsToCorrect] = covariateData[gene]; + } + + firstDataset = false; + + } + + CSVWriter writer = new CSVWriter(new FileWriter(outputDir + "/chi2diff.txt"), '\t', CSVWriter.NO_QUOTE_CHARACTER); + writer.writeNext(header); + for (String[] row : output) { + writer.writeNext(row); + } + writer.close(); + + ExpressionDataset topCovZDataset = new ExpressionDataset(genes.length, topCovs.length); + topCovZDataset.rawData = topCovZscores; + topCovZDataset.probeNames = genes; + topCovZDataset.sampleNames = topCovs; + topCovZDataset.save(outputDir + "/topCovZ.txt"); + + + } + + public void preprocessData() { + + HashMap hashGenotypes = new HashMap(); + HashMap hashExpression = new HashMap(); + HashMap hashEQTLs = new HashMap(); + ArrayList snps = new ArrayList(); + int countExcludedLines = 0; + + try { + java.io.BufferedReader in = new java.io.BufferedReader(new java.io.FileReader(new File(inputDir + "/bigTableLude.txt"))); + String str = in.readLine(); + String[] data = str.split("\t"); + for (int d = 0; d < data.length; d++) { + System.out.println(d + "\t" + data[d]); + if (data[d].endsWith("_dosage")) { + hashGenotypes.put(data[d], null); + } + if (data[d].endsWith("_exp")) { + hashExpression.put(data[d], null); + } + } + int itr = 0; + while ((str = in.readLine()) != null) { + if (!str.contains("NA")) { + data = str.split("\t"); + hashEQTLs.put(data[0], null); + snps.add(data[1]); + itr++; + if (itr % 100 == 0) { + System.out.println(itr); + } + } else { + ++countExcludedLines; + } + } + } catch (Exception e) { + 
System.out.println("Error:\t" + e.getMessage()); + e.printStackTrace(); + } + + System.out.println("EXCLUDED LINES: " + countExcludedLines); + + ExpressionDataset datasetGenotypes = new ExpressionDataset(inputDir + "/bigTableLude.txt", '\t', hashEQTLs, hashGenotypes); + datasetGenotypes.probeNames = snps.toArray(new String[snps.size()]); + datasetGenotypes.recalculateHashMaps(); + + ExpressionDataset datasetExpression = new ExpressionDataset(inputDir + "/bigTableLude.txt", '\t', hashEQTLs, hashExpression); + datasetGenotypes.save(datasetGenotypes.fileName + ".Genotypes.binary"); + datasetExpression.save(datasetGenotypes.fileName + ".Expression.binary"); + + ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt"); + datasetCovariates.save(datasetCovariates.fileName + ".Covariates.binary"); + System.exit(0); + + } + + public final String performInteractionAnalysis(String[] covsToCorrect, String[] covsToCorrect2, TextFile outputTopCovs, File snpsToSwapFile, HashMultimap qtlProbeSnpMultiMap, String[] covariatesToTest, HashMap hashSamples, int numThreads, final TIntHashSet snpsToTest, boolean skipNormalization, boolean skipCovariateNormalization, HashMultimap qtlProbeSnpMultiMapCovariates) throws IOException, Exception { + + //hashSamples = excludeOutliers(hashSamples); + + HashMap covariatesToLoad = new HashMap(); + if (covariatesToTest != null) { + for (String c : covariatesToTest) { + covariatesToLoad.put(c, null); + } + for (String c : covsToCorrect) { + covariatesToLoad.put(c, null); + } + for (String c : covsToCorrect2) { + covariatesToLoad.put(c, null); + } + for (int i = 1; i <= 50; ++i) { + covariatesToLoad.put("Comp" + i, null); + } + } else { + covariatesToLoad = null; + } + + ExpressionDataset datasetExpression = new ExpressionDataset(inputDir + "/bigTableLude.txt.Expression.binary", '\t', null, hashSamples); + ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + 
"/covariateTableLude.txt.Covariates.binary", '\t', covariatesToLoad, hashSamples); + + org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression regression = new org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression(); + int nrSamples = datasetGenotypes.nrSamples; + + correctDosageDirectionForQtl(snpsToSwapFile, datasetGenotypes, datasetExpression); + + if(!skipNormalization){ + correctExpressionData(covsToCorrect2, datasetGenotypes, datasetCovariates, datasetExpression); + } + + + + ExpressionDataset datasetCovariatesPCAForceNormal = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', covariatesToLoad, hashSamples); + + if(!skipNormalization && !skipCovariateNormalization){ + correctCovariateDataPCA(covsToCorrect2, covsToCorrect, datasetGenotypes, datasetCovariatesPCAForceNormal); + } + + + if (1 == 1) { + + if (!skipNormalization && !skipCovariateNormalization && covsToCorrect2.length != 0 && covsToCorrect.length != 0) { + correctCovariateData(covsToCorrect2, covsToCorrect, datasetGenotypes, datasetCovariates); + } + + if (!skipNormalization && !skipCovariateNormalization && !qtlProbeSnpMultiMapCovariates.isEmpty()) { + correctCovariatesForQtls(datasetCovariates, datasetGenotypes, qtlProbeSnpMultiMapCovariates); + } + + + if (1 == 2) { + saveCorrectedCovariates(datasetCovariates); + } + + if (1 == 2) { + icaCovariates(datasetCovariates); + } + if(!skipNormalization){ + forceNormalCovariates(datasetCovariates, datasetGenotypes); + } + + } + + ExpressionDataset datasetExpressionBeforeEQTLCorrection = new ExpressionDataset(datasetExpression.nrProbes, datasetExpression.nrSamples); + for (int p = 0; p < datasetExpression.nrProbes; p++) { + for (int s = 0; s < datasetExpression.nrSamples; s++) { + datasetExpressionBeforeEQTLCorrection.rawData[p][s] = datasetExpression.rawData[p][s]; + } + } + + if(!skipNormalization && covsToCorrect.length != 0){ + correctExpressionDataForInteractions(covsToCorrect, 
datasetCovariates, datasetGenotypes, nrSamples, datasetExpression, regression, qtlProbeSnpMultiMap); + } + + if(!skipNormalization){ + forceNormalExpressionData(datasetExpression); + } + + datasetExpression.save(outputDir + "/expressionDataRound_" + covsToCorrect.length + ".txt"); + datasetExpression.save(outputDir + "/expressionDataRound_" + covsToCorrect.length + ".binary"); + datasetCovariates.save(outputDir + "/covariateData_" + covsToCorrect.length + ".binary"); + + + + + if (1 == 1) { + + + + ExpressionDataset datasetZScores = new ExpressionDataset(datasetCovariates.nrProbes, datasetExpression.nrProbes); + datasetZScores.probeNames = datasetCovariates.probeNames; + + datasetZScores.sampleNames = new String[datasetGenotypes.probeNames.length]; + for (int i = 0; i < datasetGenotypes.probeNames.length; ++i) { + datasetZScores.sampleNames[i] = datasetGenotypes.probeNames[i] + datasetExpression.probeNames[i].substring(datasetExpression.probeNames[i].lastIndexOf('_')); + } + + datasetZScores.recalculateHashMaps(); + + SkippedInteractionWriter skippedWriter = new SkippedInteractionWriter(new File(outputDir + "/skippedInteractionsRound_" + covsToCorrect.length + ".txt")); + + java.util.concurrent.ExecutorService threadPool = Executors.newFixedThreadPool(numThreads); + CompletionService pool = new ExecutorCompletionService(threadPool); + int nrTasks = 0; + for (int cov = 0; cov < datasetCovariates.nrProbes; cov++) { + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[cov]); + if (stdev > 0) { + PerformInteractionAnalysisPermutationTask task = new PerformInteractionAnalysisPermutationTask(datasetGenotypes, datasetExpression, datasetCovariates, datasetCovariatesPCAForceNormal, cov, skippedWriter, snpsToTest); + pool.submit(task); + nrTasks++; + } + } + + String maxChi2Cov = ""; + int maxChi2CovI = 0; + double maxChi2 = 0; + try { + // If gene annotation provided, for chi2sum calculation use only genes that are 1mb apart + //if 
(geneDistanceMap != null) { + for (int task = 0; task < nrTasks; task++) { + try { + //System.out.println("Waiting on thread for: " + datasetCovariates.probeNames[cov]); + DoubleArrayIntegerObject result = pool.take().get(); + int cov = result.intValue; + double chi2Sum = 0; + double[] covZ = datasetZScores.rawData[cov]; + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + //if (genesFarAway(datasetZScores.sampleNames[snp], datasetZScores.probeNames[cov])) { + double z = result.doubleArray[snp]; + covZ[snp] = z; + if (!Double.isNaN(z)) { + chi2Sum += z * z; + } + //} + } + + if (chi2Sum > maxChi2 && !datasetCovariates.probeNames[cov].startsWith("Comp") && !datasetCovariates.probeNames[cov].equals("LLS") && !datasetCovariates.probeNames[cov].equals("LLdeep") && !datasetCovariates.probeNames[cov].equals("RS") && !datasetCovariates.probeNames[cov].equals("CODAM")) { + maxChi2 = chi2Sum; + maxChi2CovI = cov; + maxChi2Cov = datasetCovariates.probeNames[cov]; + } + //System.out.println(covsToCorrect.length + "\t" + cov + "\t" + datasetCovariates.probeNames[cov] + "\t" + chi2Sum); + if ((task + 1) % 512 == 0) { + System.out.println(task + 1 + " tasks processed"); + } + } catch (ExecutionException ex) { + Logger.getLogger(PerformInteractionAnalysisPermutationTask.class.getName()).log(Level.SEVERE, null, ex); + } + } + /*} //If gene annotation not provided, use all gene pairs + else { + for (int task = 0; task < nrTasks; task++) { + try { + DoubleArrayIntegerObject result = pool.take().get(); + int cov = result.intValue; + double chi2Sum = 0; + double[] covZ = datasetZScores.rawData[cov]; + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + double z = result.doubleArray[snp]; + covZ[snp] = z; + if (!Double.isNaN(z)) { + chi2Sum += z * z; + } + } + if (chi2Sum > maxChi2) { + maxChi2 = chi2Sum; + maxChi2Cov = datasetCovariates.probeNames[cov]; + } + //System.out.println(covsToCorrect.length + "\t" + cov + "\t" + datasetCovariates.probeNames[cov] + "\t" + 
chi2Sum); + if ((task + 1) % 512 == 0) { + System.out.println(task + 1 + " tasks processed"); + } + } catch (ExecutionException ex) { + Logger.getLogger(PerformInteractionAnalysisPermutationTask.class.getName()).log(Level.SEVERE, null, ex); + } + } + }*/ + threadPool.shutdown(); + } catch (Exception e) { + e.printStackTrace(); + System.out.println(e.getMessage()); + } + + System.out.println("Top covariate:\t" + maxChi2 + "\t" + maxChi2Cov); + outputTopCovs.writeln("Top covariate:\t" + maxChi2 + "\t" + maxChi2Cov); + outputTopCovs.flush(); + skippedWriter.close(); + datasetZScores.save(outputDir + "/InteractionZScoresMatrix-" + covsToCorrect.length + "Covariates.txt"); + + BufferedWriter writer = new BufferedWriter(new FileWriter(outputDir + "/" + "topCov" + maxChi2Cov + "_expression.txt")); + double[] topCovExpression = datasetCovariates.rawData[maxChi2CovI]; + for (int i = 0; i < topCovExpression.length; ++i) { + writer.append(datasetCovariates.sampleNames[i]); + writer.append('\t'); + writer.append(String.valueOf(topCovExpression[i])); + writer.append('\n'); + } + writer.close(); + + return maxChi2Cov; + } + + return null; + } + + /** + * Creates a map of gene name to GenomicBoundary containing gene coordinates + * and the coordinate of its midpoint as annotation + * + * @param annotFname - path to the annotation file (in the + * eqtlmappingpipeline format) + * @throws IOException + */ + private void createGeneDistanceMap(String annotFname) throws IOException { + System.out.println("Creating a gene distance map from " + annotFname); + + geneDistanceMap = new HashMap>(); + + TextFile annotFile = new TextFile(annotFname, false); + String els[] = annotFile.readLineElems(TextFile.tab); + + while ((els = annotFile.readLineElems(TextFile.tab)) != null) { + int start = Integer.parseInt(els[4]), end = Integer.parseInt(els[5]), middle = start + (end - start) / 2; + GenomicBoundary genomicboundary = new GenomicBoundary(els[3], Integer.parseInt(els[4]), 
Integer.parseInt(els[5]), middle); + geneDistanceMap.put(els[1], genomicboundary); + } + annotFile.close(); + } + + /** + * Checks if the genomic distance between 2 genes is more than 1mb + * + * @param gene1 + * @param gene2 + * @return true if the genes are more than 1mb apart + */ + public boolean genesFarAway(String gene1, String gene2) { + // if one of the covariates is a technical bias or a cell count etc + if ((!gene1.startsWith("ENS")) || (!gene2.startsWith("ENS"))) { + return true; + } + + GenomicBoundary gb1 = null, gb2 = null; + try { + gb1 = geneDistanceMap.get(gene1); + gb2 = geneDistanceMap.get(gene2); + + if (gb1.getChromosome() != gb2.getChromosome()) { + return true; + } + if (Math.abs(gb1.getAnnotation() - gb2.getAnnotation()) > 1000000) { + return true; + } + } catch (Exception e) { + System.out.println("Error: gene annotation doesn't contain one of these genes: " + gene1 + " or " + gene2); + System.exit(1); + } + return false; + } + + static public void orthogonalizeDataset(String inputFile) { + + ExpressionDataset dataset = new ExpressionDataset(inputFile); + dataset.transposeDataset(); + dataset.standardNormalizeData(); + int nrVars = dataset.nrProbes; + int nrSamples = dataset.nrSamples; + + double[][] matrix = new double[nrVars][nrSamples]; + for (int s = 0; s < nrVars; s++) { + for (int sample = 0; sample < nrSamples; sample++) { + matrix[s][sample] = dataset.rawData[s][sample]; + } + } + double[][] correlationMatrix = new double[nrVars][nrVars]; + for (int p = 0; p < nrVars; p++) { + correlationMatrix[p][p] = 1d; + for (int q = p + 1; q < nrVars; q++) { + double covariance = 0; + for (int sample = 0; sample < nrSamples; sample++) { + covariance += matrix[p][sample] * matrix[q][sample]; + } + covariance /= (double) (nrSamples - 1); + correlationMatrix[p][q] = covariance; + correlationMatrix[q][p] = covariance; + } + } + Jama.EigenvalueDecomposition eig = eigenValueDecomposition(correlationMatrix); + double[] eigenValues = 
eig.getRealEigenvalues(); + + double[][] eigenVectors = new double[correlationMatrix.length][correlationMatrix.length]; + ExpressionDataset datasetEigenvectors = new ExpressionDataset(correlationMatrix.length, correlationMatrix.length); + ExpressionDataset datasetEigenvalues = new ExpressionDataset(correlationMatrix.length, 2); + for (int pca = 0; pca < correlationMatrix.length; pca++) { + datasetEigenvectors.probeNames[pca] = "Comp" + (pca + 1); + datasetEigenvalues.probeNames[pca] = "Comp" + (pca + 1); + datasetEigenvectors.sampleNames[pca] = dataset.probeNames[pca]; + } + datasetEigenvalues.sampleNames[0] = "Eigenvalues"; + datasetEigenvalues.sampleNames[1] = "ExplainedVariance"; + for (int pca = 0; pca < correlationMatrix.length; pca++) { + datasetEigenvectors.rawData[pca] = getEigenVector(eig, pca); + datasetEigenvalues.rawData[pca][0] = eigenValues[eigenValues.length - 1 - pca]; + datasetEigenvalues.rawData[pca][1] = getEigenValueVar(eigenValues, pca); + System.out.println(pca + "\tExplainedVariance:\t" + getEigenValueVar(eigenValues, pca) + "\tEigenvalue:\t" + eigenValues[eigenValues.length - 1 - pca]); + } + datasetEigenvectors.transposeDataset(); + datasetEigenvectors.save(inputFile + ".Eigenvectors.txt"); + datasetEigenvalues.save(inputFile + ".Eigenvalues.txt"); + + //Calculate principal components: + ExpressionDataset datasetPCs = new ExpressionDataset(dataset.nrSamples, correlationMatrix.length); + for (int pca = 0; pca < correlationMatrix.length; pca++) { + datasetPCs.sampleNames[pca] = "Comp" + (pca + 1); + } + for (int p = 0; p < datasetPCs.nrProbes; p++) { + datasetPCs.probeNames[p] = dataset.sampleNames[p]; + } + for (int pca = 0; pca < correlationMatrix.length; pca++) { + for (int p = 0; p < dataset.nrProbes; p++) { + for (int s = 0; s < dataset.nrSamples; s++) { + datasetPCs.rawData[s][pca] += datasetEigenvectors.rawData[p][pca] * dataset.rawData[p][s]; + } + } + } + datasetPCs.save(dataset.fileName + ".PrincipalComponents.txt"); + + 
ExpressionDataset datasetFactorloadings = new ExpressionDataset(correlationMatrix.length, correlationMatrix.length); + datasetPCs.transposeDataset(); + for (int p = 0; p < dataset.nrProbes; p++) { + datasetFactorloadings.probeNames[p] = dataset.probeNames[p]; + } + for (int pca = 0; pca < datasetPCs.nrProbes; pca++) { + datasetFactorloadings.sampleNames[pca] = "Comp" + (pca + 1); + for (int p = 0; p < dataset.nrProbes; p++) { + datasetFactorloadings.rawData[p][pca] = JSci.maths.ArrayMath.correlation(datasetPCs.rawData[pca], dataset.rawData[p]); + } + } + datasetFactorloadings.save(dataset.fileName + ".Factorloadings.txt"); + + } + + static public ExpressionDataset orthogonalizeMatrix(ExpressionDataset dataset) { + + dataset.standardNormalizeData(); + int nrVars = dataset.nrProbes; + int nrSamples = dataset.nrSamples; + double[][] matrix = new double[nrVars][nrSamples]; + for (int s = 0; s < nrVars; s++) { + for (int sample = 0; sample < nrSamples; sample++) { + matrix[s][sample] = dataset.rawData[s][sample]; + } + } + double[][] correlationMatrix = new double[nrVars][nrVars]; + for (int p = 0; p < nrVars; p++) { + correlationMatrix[p][p] = 1d; + for (int q = p + 1; q < nrVars; q++) { + double covariance = 0; + for (int sample = 0; sample < nrSamples; sample++) { + covariance += matrix[p][sample] * matrix[q][sample]; + } + covariance /= (double) (nrSamples - 1); + if (covariance > 1) { + covariance = 1d; + } + if (covariance < -1) { + covariance = -1d; + } + correlationMatrix[p][q] = covariance; + correlationMatrix[q][p] = covariance; + } + } + Jama.EigenvalueDecomposition eig = eigenValueDecomposition(correlationMatrix); + double[] eigenValues = eig.getRealEigenvalues(); + int nrCompsWithPositiveEigenvalues = 0; + for (int e = 0; e < eigenValues.length; e++) { + //System.out.println(e + "\t" + eigenValues[e]); + if (eigenValues[e] > 1e-10) { + nrCompsWithPositiveEigenvalues++; + } + } + + ExpressionDataset datasetEigenvectors = new 
ExpressionDataset(correlationMatrix.length, correlationMatrix.length); + for (int pca = 0; pca < correlationMatrix.length; pca++) { + datasetEigenvectors.rawData[pca] = getEigenVector(eig, pca); + } + datasetEigenvectors.transposeDataset(); + + //Calculate principal components: + ExpressionDataset datasetPCs = new ExpressionDataset(dataset.nrSamples, nrCompsWithPositiveEigenvalues); + for (int pca = 0; pca < nrCompsWithPositiveEigenvalues; pca++) { + datasetPCs.sampleNames[pca] = "Comp" + (pca + 1); + } + for (int p = 0; p < datasetPCs.nrProbes; p++) { + datasetPCs.probeNames[p] = dataset.sampleNames[p]; + } + for (int pca = 0; pca < nrCompsWithPositiveEigenvalues; pca++) { + for (int p = 0; p < dataset.nrProbes; p++) { + for (int s = 0; s < dataset.nrSamples; s++) { + datasetPCs.rawData[s][pca] += datasetEigenvectors.rawData[p][pca] * dataset.rawData[p][s]; + } + } + } + datasetPCs.transposeDataset(); + return datasetPCs; + + } + + static public double[] getLinearRegressionCoefficients(double[] xVal, double[] yVal) { + double n = (double) xVal.length; + double sumX = 0; + double sumXX = 0; + double sumY = 0; + double sumXY = 0; + for (int x = 0; x < xVal.length; x++) { + sumX += xVal[x]; + sumXX += xVal[x] * xVal[x]; + sumY += yVal[x]; + sumXY += xVal[x] * yVal[x]; + } + double sXX = sumXX - sumX * sumX / n; + double sXY = sumXY - sumX * sumY / n; + double a = sXY / sXX; + double b = (sumY - a * sumX) / n; + double[] regressionCoefficients = new double[2]; + regressionCoefficients[0] = a; + regressionCoefficients[1] = b; + return regressionCoefficients; + } + + static public Jama.EigenvalueDecomposition eigenValueDecomposition(double[][] data) { + Jama.Matrix m = new Jama.Matrix(data); + Jama.EigenvalueDecomposition eig = m.eig(); + return eig; + } + + static public double[] getEigenVector(Jama.EigenvalueDecomposition eig, double[] eigenValues, int pca) { + Jama.Matrix eigenValueMatrix = eig.getV(); + double[][] eigenValueMat = eigenValueMatrix.getArray(); + 
double[] eigenVector = new double[eigenValueMat.length]; + for (int i = 0; i < eigenValueMat.length; i++) { + eigenVector[i] = eigenValueMat[i][eigenValueMat.length - 1 - pca]; // * Math.sqrt(eigenValues[eigenValues.length - 1 - pca]); + } + return eigenVector; + } + + static public double[] getEigenVector(Jama.EigenvalueDecomposition eig, int pca) { + Jama.Matrix eigenValueMatrix = eig.getV(); + double[][] eigenValueMat = eigenValueMatrix.getArray(); + double[] eigenVector = new double[eigenValueMat.length]; + for (int i = 0; i < eigenValueMat.length; i++) { + eigenVector[i] = eigenValueMat[i][eigenValueMat.length - 1 - pca]; // * Math.sqrt(eigenValues[eigenValues.length - 1 - pca]); + } + return eigenVector; + } + + static public double getEigenValueVar(double[] eigenValues, int pca) { + double sumEigenvalues = 0.0; + for (Double d : eigenValues) { + sumEigenvalues += Math.abs(d); + } + double result = eigenValues[eigenValues.length - 1 - pca] / sumEigenvalues; + return result; + } + + static public double[] getEigenVectorSVD(Jama.SingularValueDecomposition svd, double[] singularValues, int pca) { + Jama.Matrix eigenValueMatrix = svd.getV(); + double[][] eigenValueMat = eigenValueMatrix.getArray(); + double[] eigenVector = new double[eigenValueMat.length]; + for (int i = 0; i < eigenValueMat.length; i++) { + eigenVector[i] = eigenValueMat[i][pca] * Math.sqrt(singularValues[pca]); + } + return eigenVector; + } + + private void correctCovariatesForQtls(ExpressionDataset datasetCovariates, ExpressionDataset datasetGenotypes, HashMultimap qtlProbeSnpMultiMap) throws Exception { + + System.out.println("Correcting covariate data for cis-eQTL effects:"); + + OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression(); + + HashMap snpMap = new HashMap(datasetGenotypes.nrProbes); + for (Map.Entry snpEntry : datasetGenotypes.hashProbes.entrySet()) { + try { + snpMap.put(snpEntry.getKey().substring(0, snpEntry.getKey().indexOf('_')), snpEntry.getValue()); + } catch 
(Exception e) { + System.out.println(snpEntry.getKey()); + throw e; + } + } + + for (int p = 0; p < datasetCovariates.nrProbes; p++) { + + String probe = datasetCovariates.probeNames[p]; + Set probeQtls = qtlProbeSnpMultiMap.get(probe); + +// System.out.println(""); +// System.out.println("-------------------------------------"); +// System.out.println(""); +// System.out.println(probe + " with " + probeQtls.size() + " SNPs"); +// System.out.println(""); + + + if (!probeQtls.isEmpty()) { + + int snpsInData = 0; + for (String snp : probeQtls) { + + Integer s = snpMap.get(snp); + if (s != null) { + ++snpsInData; + } + + } + + double[][] x = new double[datasetCovariates.nrSamples][snpsInData]; + + int k = 0; + for (String snp : probeQtls) { + + Integer s = snpMap.get(snp); + if (s == null) { + continue; + //throw new Exception("Snp " + snp + " not found"); + } + double[] snpData = datasetGenotypes.rawData[s]; + for (int i = 0; i < datasetGenotypes.nrSamples; ++i) { + x[i][k] = snpData[i]; + } + + k++; + } + + + +// PearsonsCorrelation cor = new PearsonsCorrelation(); + +// System.out.println("Before"); +// for(String snp : probeQtls){ +// Integer s = snpMap.get(snp); +// System.out.println(snp + " - " + cor.correlation(datasetCovariates.rawData[p], datasetGenotypes.rawData[s])); +// } + + ols.newSampleData(datasetCovariates.rawData[p], x); + datasetCovariates.rawData[p] = ols.estimateResiduals(); + +// System.out.println("After"); +// for(String snp : probeQtls){ +// Integer s = snpMap.get(snp); +// System.out.println(snp + " - " + cor.correlation(datasetCovariates.rawData[p], datasetGenotypes.rawData[s])); +// } + + } + + } + + + + +// for (int p = 0; p < datasetCovariates.nrProbes; p++) { +// if (datasetExpression.hashProbes.containsKey(datasetCovariates.probeNames[p])) { +// int index = ((Integer) datasetExpression.hashProbes.get(datasetCovariates.probeNames[p])).intValue(); +// double[] rc = getLinearRegressionCoefficients(datasetGenotypes.rawData[index], 
datasetCovariates.rawData[p]); +// for (int s = 0; s < datasetGenotypes.nrSamples; s++) { +// datasetCovariates.rawData[p][s] -= rc[0] * datasetGenotypes.rawData[index][s]; +// } +// } +// } + + } + + private HashMap excludeOutliers(HashMap hashSamples) { + System.out.println("Removing outlier samples!!!"); + HashMap hashCovariates = new HashMap(); + hashCovariates.put("MEDIAN_5PRIME_BIAS", null); + hashCovariates.put("MEDIAN_3PRIME_BIAS", null); + ExpressionDataset datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, null); + hashSamples = new HashMap(); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + if (datasetCovariates.rawData[0][s] != 0) { + hashSamples.put(datasetCovariates.sampleNames[s], null); + } + } + datasetCovariates = new ExpressionDataset(inputDir + "/covariateTableLude.txt.Covariates.binary", '\t', hashCovariates, hashSamples); + HashMap hashSamplesToExclude = new HashMap(); + if (1 == 1) { + int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_5PRIME_BIAS")).intValue(); + double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]); + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + double z = (datasetCovariates.rawData[index][s] - mean) / stdev; + if (Math.abs(z) > 3) { + hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null); + } + } + } + if (1 == 1) { + int index = ((Integer) datasetCovariates.hashProbes.get("MEDIAN_3PRIME_BIAS")).intValue(); + double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[index]); + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[index]); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + double z = (datasetCovariates.rawData[index][s] - mean) / stdev; + if (Math.abs(z) > 3) { + hashSamplesToExclude.put(datasetCovariates.sampleNames[s], null); + } + } + } + 
hashSamples = new HashMap(); + for (int s = 0; s < datasetCovariates.nrSamples; s++) { + if (!hashSamplesToExclude.containsKey(datasetCovariates.sampleNames[s])) { + hashSamples.put(datasetCovariates.sampleNames[s], null); + hashSamples.put(datasetCovariates.sampleNames[s] + "_exp", null); + hashSamples.put(datasetCovariates.sampleNames[s] + "_dosage", null); + } + } + return hashSamples; + } + + private void correctCovariateData(String[] covsToCorrect2, String[] covsToCorrect, ExpressionDataset datasetGenotypes, ExpressionDataset datasetCovariates) throws Exception { + + System.out.println("Correcting covariate data for cohort specific effects:"); +// String[] cohorts = {"LLDeep","LLS","RS","CODAM"}; + ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect2.length + covsToCorrect.length, datasetGenotypes.nrSamples); + datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames; + + HashMap hashCovsToCorrect = new HashMap(); + + for (int i = 0; i < covsToCorrect2.length; ++i) { + String cov = covsToCorrect2[i]; + hashCovsToCorrect.put(cov, null); + Integer c = datasetCovariates.hashProbes.get(cov); + if (c == null) { + throw new Exception("Covariate not found: " + cov); + } + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[i][s] = datasetCovariates.rawData[c][s]; + } + } + +// for (int p=0; p 1E-5) { + double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetCovariates.rawData[p]); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariates.rawData[p][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s]; + } + } + } + double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[p]); + double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[p]); + if (stdev < 1E-5) { + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariates.rawData[p][s] = mean; + } + } + } + } + } + 
+ private ExpressionDataset correctCovariateDataPCA(String[] covsToCorrect2, String[] covsToCorrect, ExpressionDataset datasetGenotypes, ExpressionDataset datasetCovariatesPCAForceNormal) throws Exception { + + int nrCompsToCorrectFor = 25; + + System.out.println("Preparing data for testing eQTL effects of SNPs on covariate data:"); + System.out.println("Correcting covariate data for cohort specific effects:"); + + ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect2.length + covsToCorrect.length + nrCompsToCorrectFor, datasetGenotypes.nrSamples); + datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames; + + // add covariates from the first list + HashMap hashCovsToCorrect = new HashMap(); + + // add covariates from the second list + for (int i = 0; i < covsToCorrect2.length; ++i) { + String cov = covsToCorrect2[i]; + hashCovsToCorrect.put(cov, null); + Integer c = datasetCovariatesPCAForceNormal.hashProbes.get(cov); + if (c == null) { + throw new Exception("Covariate not found: " + cov); + } + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[i][s] = datasetCovariatesPCAForceNormal.rawData[c][s]; + } + } + + int[] covsToCorrectIndex = new int[covsToCorrect.length]; + for (int c = 0; c < covsToCorrect.length; c++) { + hashCovsToCorrect.put(covsToCorrect[c], null); + covsToCorrectIndex[c] = ((Integer) datasetCovariatesPCAForceNormal.hashProbes.get(covsToCorrect[c])).intValue(); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[covsToCorrect2.length + c][s] = datasetCovariatesPCAForceNormal.rawData[covsToCorrectIndex[c]][s]; + } + } + + // add PCs + if (nrCompsToCorrectFor > 0) { + for (int comp = 0; comp < nrCompsToCorrectFor; comp++) { + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[covsToCorrect2.length + covsToCorrect.length + comp][s] = 
datasetCovariatesPCAForceNormal.rawData[datasetCovariatesPCAForceNormal.nrProbes - 51 + comp][s]; + } + } + } + + datasetCovariatesToCorrectFor.transposeDataset(); + + datasetCovariatesToCorrectFor.save(inputDir + "/CovariatesToCorrectFor.txt"); + orthogonalizeDataset(inputDir + "/CovariatesToCorrectFor.txt"); + datasetCovariatesToCorrectFor = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.PrincipalComponents.txt"); + datasetCovariatesToCorrectFor.transposeDataset(); + ExpressionDataset datasetCovariatesToCorrectForEigenvalues = new ExpressionDataset(inputDir + "/CovariatesToCorrectFor.txt.Eigenvalues.txt"); + + for (int p = 0; p < datasetCovariatesPCAForceNormal.nrProbes; p++) { + if (!hashCovsToCorrect.containsKey(datasetCovariatesPCAForceNormal.probeNames[p])) { + for (int cov = 0; cov < datasetCovariatesToCorrectFor.nrProbes; cov++) { + if (datasetCovariatesToCorrectForEigenvalues.rawData[cov][0] > 1E-5) { + double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetCovariatesPCAForceNormal.rawData[p]); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesPCAForceNormal.rawData[p][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s]; + } + } + } + /*double stdev = JSci.maths.ArrayMath.standardDeviation(datasetCovariates.rawData[p]); + double mean = JSci.maths.ArrayMath.mean(datasetCovariates.rawData[p]); + if (stdev < 1E-5) { + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesPCAForceNormal.rawData[p][s] = mean; + } + }*/ + } + } + + System.out.println("Enforcing normal distribution on covariates"); + + NaturalRanking ranker = new NaturalRanking(); + + for (int p = 0; p < datasetCovariatesPCAForceNormal.nrProbes; p++) { + //Rank order the expression values: + double[] values = new double[datasetCovariatesPCAForceNormal.nrSamples]; + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + values[s] = datasetCovariatesPCAForceNormal.rawData[p][s]; + } + 
double[] rankedValues = ranker.rank(values); + //Replace the original expression value with the standard distribution enforce: + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + //Convert the rank to a proportion, with range <0, 1> + double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); + //Convert the pValue to a Z-Score: + double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue); + datasetCovariatesPCAForceNormal.rawData[p][s] = zScore; //Replace original expression value with the Z-Score + } + } + return datasetCovariatesPCAForceNormal; + } + + private void correctExpressionData(String[] covsToCorrect2, ExpressionDataset datasetGenotypes, ExpressionDataset datasetCovariates, ExpressionDataset datasetExpression) throws Exception { + //Define a set of covariates that we want to use as correction: + System.out.println("Correcting gene expression data for cohort specific effects and top 25 components"); + //String[] cohorts = {"LLDeep", "LLS", "RS", "CODAM"}; + int nrCompsToCorrectFor = 25; + ExpressionDataset datasetCovariatesToCorrectFor = new ExpressionDataset(covsToCorrect2.length + nrCompsToCorrectFor, datasetGenotypes.nrSamples); + datasetCovariatesToCorrectFor.sampleNames = datasetGenotypes.sampleNames; + + for (int i = 0; i < covsToCorrect2.length; ++i) { + String cov = covsToCorrect2[i]; + Integer c = datasetCovariates.hashProbes.get(cov); + if (c == null) { + throw new Exception("Covariate not found: " + cov); + } + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[i][s] = datasetCovariates.rawData[c][s]; + } + } + +// for (int p = 0; p < cohorts.length; p++) { +// for (int s = 0; s < datasetGenotypes.nrSamples; s++) { +// if (datasetGenotypes.sampleNames[s].startsWith(cohorts[p])) { +// datasetCovariatesToCorrectFor.rawData[p][s] = 1; +// } +// } +// } + if (nrCompsToCorrectFor > 0) { + for (int comp = 0; comp < nrCompsToCorrectFor; comp++) { + for (int s = 0; s 
< datasetGenotypes.nrSamples; s++) { + datasetCovariatesToCorrectFor.rawData[covsToCorrect2.length + comp][s] = datasetCovariates.rawData[datasetCovariates.nrProbes - 51 + comp][s]; + } + } + } + + datasetCovariatesToCorrectFor.transposeDataset(); + + datasetCovariatesToCorrectFor.save(outputDir + "/CovariatesToCorrectFor.txt"); + orthogonalizeDataset(outputDir + "/CovariatesToCorrectFor.txt"); + datasetCovariatesToCorrectFor = new ExpressionDataset(outputDir + "/CovariatesToCorrectFor.txt.PrincipalComponents.txt"); + datasetCovariatesToCorrectFor.transposeDataset(); + ExpressionDataset datasetCovariatesToCorrectForEigenvalues = new ExpressionDataset(outputDir + "/CovariatesToCorrectFor.txt.Eigenvalues.txt"); + for (int snp = 0; snp < datasetExpression.nrProbes; snp++) { + for (int cov = 0; cov < datasetCovariatesToCorrectFor.nrProbes; cov++) { + if (datasetCovariatesToCorrectForEigenvalues.rawData[cov][0] > 1E-5) { + double[] rc = getLinearRegressionCoefficients(datasetCovariatesToCorrectFor.rawData[cov], datasetExpression.rawData[snp]); + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetExpression.rawData[snp][s] -= rc[0] * datasetCovariatesToCorrectFor.rawData[cov][s]; + } + } + } + } + } + + private void correctDosageDirectionForQtl(File snpsToSwapFile, ExpressionDataset datasetGenotypes, ExpressionDataset datasetExpression) throws IOException { + //double[] mainEQTLCorr = new double[datasetGenotypes.nrProbes]; + + + if (snpsToSwapFile != null) { + System.out.println("Enforcing for every eQTL that the genotype dosage is swapped based on: " + snpsToSwapFile.getAbsolutePath()); + + HashSet snpsToSwap = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(snpsToSwapFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + snpsToSwap.add(line); + } + reader.close(); + + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + + if 
(snpsToSwap.contains(datasetGenotypes.probeNames[snp])) { + + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetGenotypes.rawData[snp][s] = 2 - datasetGenotypes.rawData[snp][s]; + } + + } + + //mainEQTLCorr[snp] = corr; + } + + + } else { + System.out.println("Enforcing for every eQTL that the genotype dosage positively correlated with gene expression levels:"); + + Writer writer = new BufferedWriter(new FileWriter(outputDir + "/swappedDosages.txt")); + for (int snp = 0; snp < datasetGenotypes.nrProbes; snp++) { + double corr = JSci.maths.ArrayMath.correlation(datasetGenotypes.rawData[snp], datasetExpression.rawData[snp]); + //System.out.println(datasetExpression.probeNames[snp] + "\t" + snp + "\t" + corr); + + if (corr < 0) { + corr = -corr; + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + datasetGenotypes.rawData[snp][s] = 2 - datasetGenotypes.rawData[snp][s]; + } + writer.append(datasetGenotypes.probeNames[snp]); + writer.append('\n'); + } + + //mainEQTLCorr[snp] = corr; + } + writer.close(); + + } + } + + private void saveCorrectedCovariates(ExpressionDataset datasetCovariates) { + datasetCovariates.save(inputDir + "/CovariatesCorrected.txt"); + HashMap hashProbesToFilter = new HashMap(); + for (int p = 0; p < datasetCovariates.nrProbes; p++) { + if (datasetCovariates.probeNames[p].startsWith("ENSG")) { + hashProbesToFilter.put(datasetCovariates.probeNames[p], null); + } + } + ExpressionDataset datasetCovariatesCorrected = new ExpressionDataset(inputDir + "/CovariatesCorrected.txt", '\t', hashProbesToFilter, null); + datasetCovariatesCorrected.transposeDataset(); + datasetCovariatesCorrected.save(inputDir + "/CovariatesCorrected.txt"); + System.exit(0); + } + + private void icaCovariates(ExpressionDataset datasetCovariates) { + ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/mixingmatrix.txt"); + //ExpressionDataset datasetICA = new ExpressionDataset("/Users/lude/Documents/ICA/signals.txt"); + 
datasetICA.transposeDataset(); + for (int p = 0; p < datasetICA.nrProbes; p++) { + datasetCovariates.rawData[p] = datasetICA.rawData[p]; + datasetCovariates.probeNames[p] = datasetICA.probeNames[p]; + if (p == 7) { + for (int q = 0; q < datasetCovariates.nrProbes; q++) { + double corr = JSci.maths.ArrayMath.correlation(datasetICA.rawData[p], datasetCovariates.rawData[q]); + System.out.println(p + "\t" + datasetICA.probeNames[p] + "\t" + q + "\t" + datasetCovariates.probeNames[q] + "\t" + corr + "\t" + corr * corr); + } + } + } + + orthogonalizeDataset("/Users/lude/Documents/ICA/mixingmatrix.txt"); + //System.exit(0); + } + + private void forceNormalCovariates(ExpressionDataset datasetCovariates, ExpressionDataset datasetGenotypes) throws ArithmeticException { + System.out.println("Enforcing normal distribution on covariates"); + + NaturalRanking ranker = new NaturalRanking(); + + for (int p = 0; p < datasetCovariates.nrProbes; p++) { + //Rank order the expression values: + double[] values = new double[datasetCovariates.nrSamples]; + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + values[s] = datasetCovariates.rawData[p][s]; + } + double[] rankedValues = ranker.rank(values); + //Replace the original expression value with the standard distribution enforce: + for (int s = 0; s < datasetGenotypes.nrSamples; s++) { + //Convert the rank to a proportion, with range <0, 1> + double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); + //Convert the pValue to a Z-Score: + double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue); + datasetCovariates.rawData[p][s] = zScore; //Replace original expression value with the Z-Score + } + } + } + + private void correctExpressionDataForInteractions(String[] covsToCorrect, ExpressionDataset datasetCovariates, ExpressionDataset datasetGenotypes, int nrSamples, ExpressionDataset datasetExpression, OLSMultipleLinearRegression regression, HashMultimap qtlProbeSnpMultiMap) throws 
MathIllegalArgumentException, Exception { + + System.out.println("Correcting expression data for predefined gene environment interaction effects: " + Arrays.toString(covsToCorrect)); + int[] covsToCorrectIndex = new int[covsToCorrect.length]; + for (int c = 0; c < covsToCorrect.length; c++) { + covsToCorrectIndex[c] = ((Integer) datasetCovariates.hashProbes.get(covsToCorrect[c])).intValue(); + + } + + HashMap snpMap = new HashMap(datasetGenotypes.nrProbes); + for (Map.Entry snpEntry : datasetGenotypes.hashProbes.entrySet()) { + try { + snpMap.put(snpEntry.getKey().substring(0, snpEntry.getKey().indexOf('_')), snpEntry.getValue()); + } catch (Exception e) { + System.out.println(snpEntry.getKey()); + throw e; + } + } + + Variance v = new Variance(); + + for (int p = 0; p < datasetExpression.nrProbes; p++) { + + String probe = datasetExpression.probeNames[p].substring(0, datasetExpression.probeNames[p].lastIndexOf('_')); + Set probeQtls = qtlProbeSnpMultiMap.get(probe); + + if (probeQtls.isEmpty()) { + throw new Exception("No eQTLs found for: " + probe); + } + + int snpsInData = 0; + HashSet excludedSnps = new HashSet(); + for (String snp : probeQtls) { + + Integer s = snpMap.get(snp); + if (s != null) { + + + + if (v.evaluate(datasetGenotypes.rawData[s]) > 0) { + ++snpsInData; + } else { + excludedSnps.add(snp); + } + + } + + + + } + + //boolean foundPisS = false; + double[][] valsX = new double[nrSamples][snpsInData + covsToCorrect.length * 2]; //store genotypes, covariates, interactions + int k = 0; + for (String snp : probeQtls) { + + if (excludedSnps.contains(snp)) { + continue; + } + + Integer s = snpMap.get(snp); + if (s == null) { + //throw new Exception("Snp " + snp + " not found"); + continue; + } +// if(s.intValue() == p){ +// foundPisS = true; +// } + double[] snpData = datasetGenotypes.rawData[s]; + for (int i = 0; i < datasetGenotypes.nrSamples; ++i) { + valsX[i][k] = snpData[i]; + } + + k++; + } +// if(!foundPisS){ +// +// System.out.println("Expected 
snp: " + datasetGenotypes.probeNames[p] + " at index: " + p); +// +// for(String qtlSnp : probeQtls = qtlProbeSnpMultiMap.get(probe)){ +// System.out.println("QTL snp: " + qtlSnp + " found at index: " + snpMap.get(qtlSnp)); +// } +// +// throw new Exception("Error 2"); +// } + for (int c = 0; c < covsToCorrect.length; c++) { + double[] covData = datasetCovariates.rawData[covsToCorrectIndex[c]]; + double[] snpData = datasetGenotypes.rawData[p]; + + for (int s = 0; s < nrSamples; s++) { + valsX[s][c * 2 + snpsInData] = covData[s]; //covariate + valsX[s][c * 2 + snpsInData + 1] = snpData[s] * covData[s]; //interction + } + } + double[] valsY = datasetExpression.rawData[p]; + regression.newSampleData(valsY, valsX); + try { + datasetExpression.rawData[p] = regression.estimateResiduals(); + } catch (Exception up) { + System.err.println("Error correcting for interactions: " + probe + " - " + datasetGenotypes.probeNames[p]); + } + } + } + + private void forceNormalExpressionData(ExpressionDataset datasetExpression) throws ArithmeticException { + System.out.println("Enforcing normal distribution on expression data:"); + + NaturalRanking ranker = new NaturalRanking(); + + for (int p = 0; p < datasetExpression.nrProbes; p++) { + //Rank order the expression values: + double[] values = new double[datasetExpression.nrSamples]; + for (int s = 0; s < datasetExpression.nrSamples; s++) { + values[s] = datasetExpression.rawData[p][s]; + } + + double[] rankedValues = ranker.rank(values); + //Replace the original expression value with the standard distribution enforce: + for (int s = 0; s < datasetExpression.nrSamples; s++) { + //Convert the rank to a proportion, with range <0, 1> + double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); + //Convert the pValue to a Z-Score: + double zScore = cern.jet.stat.tdouble.Probability.normalInverse(pValue); + datasetExpression.rawData[p][s] = zScore; //Replace original expression value with the Z-Score + } + } + + 
System.out.println("Expression data now force normal"); + } + + private HashMap readEnsgAnnotations(File ensgAnnotationFile) throws FileNotFoundException, IOException { + final HashMap ensgAnnotations = new HashMap(); + CSVReader refReader = new CSVReader(new FileReader(ensgAnnotationFile), '\t', '\0', '\0'); + refReader.readNext(); + String[] nextLine; + while ((nextLine = refReader.readNext()) != null) { + ensgAnnotations.put(nextLine[0], new GeneAnnotation(nextLine[0], nextLine[1], nextLine[2], Integer.valueOf(nextLine[3]), Integer.valueOf(nextLine[4]))); + } + return ensgAnnotations; + } +} \ No newline at end of file diff --git a/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/VectorSorter.java b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/VectorSorter.java new file mode 100644 index 000000000..3915cdb0c --- /dev/null +++ b/eQTLInteractionAnalyser/src/main/java/nl/systemsgenetics/eqtlinteractionanalyser/eqtlinteractionanalyser/VectorSorter.java @@ -0,0 +1,129 @@ +/* + * VectorSorter.java + * + * Created on 23 December 2003, 17:02 + */ + +package nl.systemsgenetics.eqtlinteractionanalyser.eqtlinteractionanalyser; + + import java.util.*; + import java.io.*; + + +/** An implementation of a quick sort algorithm. + * + * Shamelessly taken from Martin Senger's (senger@ebi.ac.uk) Java collection + * + * It sorts a vector of strings, but can be easily extended to sort + * a vector of other objects (by overwritting just one method + * lt(Object,Object)). + *

+ * It implements a generic version of C.A.R Hoare's Quick Sort + * algorithm. + *

+ * The code is based on example given in java swing package. + *

+ * + * This is an example how to use this class to sort a vector of strings: + *

+ *    Vector v = new Vector();
+ *    v.addElement ("X");
+ *    v.addElement ("A");
+ *    new Sorter().sort (v);
+ * 
+ * + * @author Martin Senger + * @version $Id: VectorSorter.java,v 1.1.1.1 2004/01/26 09:27:02 lude Exp $ + * @see TestSorter + */ +public class VectorSorter { + + /**************************************************************************** + * A default constructor. It does nothing. + ****************************************************************************/ + public VectorSorter() { + } + + /**************************************************************************** + * Sort the given vector. + * By default it is assumed that the vector contains elements of type String. + * If not a subclass must be written which overwrites method + * lt(Object,Object). + *

+ * @param v a vector to be sorted + ****************************************************************************/ + public void sort (Vector v) { + quickSort (v, 0, v.size() - 1); + } + + /**************************************************************************** + * Compare two objects. + *

+ * By default this method works for Strings. It is meant to be overwritten + * for other objects. + *

+ * @param a the first object to be compared + * @param b the second object to be compared + * @return true if the first object is lower than the second one + ****************************************************************************/ + protected boolean lt (Object a, Object b) { + return ((String)a).compareTo ((String)b) < 0; + } + + /**************************************************************************** + * The main algorithm. + ****************************************************************************/ + private void quickSort (Vector v, int lo0, int hi0) { + int lo = lo0; + int hi = hi0; + Object mid; + + if (hi0 > lo0) { + // Arbitrarily establishing partition element as the midpoint of + // the array. + mid = v.elementAt ((lo0 + hi0) / 2); + + // loop through the array until indices cross + while (lo <= hi) { + // find the first element that is greater than or equal to + // the partition element starting from the left Index. + while ((lo < hi0) && lt (v.elementAt (lo), mid)) { + ++lo; + } + + // find an element that is smaller than or equal to + // the partition element starting from the right Index. + while ((hi > lo0) && lt (mid, v.elementAt(hi))) { + --hi; + } + + // if the indexes have not crossed, swap + if (lo <= hi) { + swap (v, lo, hi); + ++lo; + --hi; + } + } + + + // If the right index has not reached the left side of array + // must now sort the left partition. + if (lo0 < hi) { + quickSort (v, lo0, hi); + } + + // If the left index has not reached the right side of array + // must now sort the right partition. 
+ if (lo < hi0) { + quickSort (v, lo, hi0); + } + } + } + + private static void swap (Vector a, int i, int j) { + Object T = a.elementAt(i); + a.setElementAt (a.elementAt(j), i); + a.setElementAt (T, j); + } + +} diff --git a/eQTLInteractionAnalyser/src/test/java/nl/systemsgenetics/eqtlinteractionanalyser/AppTest.java b/eQTLInteractionAnalyser/src/test/java/nl/systemsgenetics/eqtlinteractionanalyser/AppTest.java new file mode 100644 index 000000000..fd6b2c4e5 --- /dev/null +++ b/eQTLInteractionAnalyser/src/test/java/nl/systemsgenetics/eqtlinteractionanalyser/AppTest.java @@ -0,0 +1,38 @@ +package nl.systemsgenetics.eqtlinteractionanalyser; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * Unit test for simple App. + */ +public class AppTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public AppTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( AppTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +} diff --git a/eqtl-functional-enrichment/src/main/java/nl/systemsgenetics/eqtlpermutationtranscriptionfactoranalysis/Testing.java b/eqtl-functional-enrichment/src/main/java/nl/systemsgenetics/eqtlpermutationtranscriptionfactoranalysis/Testing.java index 17591ce62..cf168a21d 100644 --- a/eqtl-functional-enrichment/src/main/java/nl/systemsgenetics/eqtlpermutationtranscriptionfactoranalysis/Testing.java +++ b/eqtl-functional-enrichment/src/main/java/nl/systemsgenetics/eqtlpermutationtranscriptionfactoranalysis/Testing.java @@ -8,7 +8,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.regex.Pattern; -import org.bouncycastle.util.Arrays; +//import org.bouncycastle.util.Arrays; import umcg.genetica.genomicboundaries.GenomicBoundaries; import 
umcg.genetica.graphics.ViolinBoxPlot; import umcg.genetica.io.text.TextFile; diff --git a/eqtl-mapping-pipeline/nb-configuration.xml b/eqtl-mapping-pipeline/nb-configuration.xml index a576e1d3e..15fa6c5e7 100644 --- a/eqtl-mapping-pipeline/nb-configuration.xml +++ b/eqtl-mapping-pipeline/nb-configuration.xml @@ -6,15 +6,6 @@ The configuration is intended to be shared among all the users of project and therefore it is assumed to be part of version control checkout. Without this configuration present, some functionality in the IDE may be limited or fail altogether. --> - - - JDK_1.7 - @@ -25,4 +16,13 @@ Any value defined here will override the pom.xml file value but is only applicab + + + JDK_1.7 + diff --git a/eqtl-mapping-pipeline/pom.xml b/eqtl-mapping-pipeline/pom.xml index 6b95cd552..47279a84f 100644 --- a/eqtl-mapping-pipeline/pom.xml +++ b/eqtl-mapping-pipeline/pom.xml @@ -1,142 +1,156 @@ - - nl.systemsgenetics - systemsgenetics - 1.0.2-SNAPSHOT - - eqtl-mapping-pipeline - 1.3.1-SNAPSHOT - jar - 4.0.0 - - - nl.systemsgenetics - genetica-libraries - 1.0.5 - - - log4j - log4j - 1.2.17 - - - nl.systemsgenetics - Genotype-IO - 1.0.1 - - - net.sf.trove4j - trove4j - 3.0.3 - - - commons-cli - commons-cli - 1.2 - - - commons-beanutils - commons-beanutils - 1.8.3 - - - commons-codec - commons-codec - 1.5 - - - commons-digester - commons-digester - 2.0 - - - net.sourceforge.parallelcolt - parallelcolt - 0.10.0 - - - ${project.groupId} - imputation-tool - 1.0.3 - - - net.rforge - Rserve - 0.6-8.1 - - - org.testng - testng - 6.5.2 - test - - - net.sf.opencsv - opencsv - 2.3 - - - - - - src/main/resources - true - - **/version.properties - - - - eqtl-mapping-pipeline-${project.version} - - - - org.apache.maven.plugins - maven-assembly-plugin - 2.4 - - - - - src/main/assembly/assembly.xml - - - - - make-assembly - package - - single - - - - - - - org.apache.maven.plugins - maven-jar-plugin - 2.3.1 - - - - - - - - true - - lib/ - - eqtlmappingpipeline.Main - - - - - - + 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + nl.systemsgenetics + systemsgenetics + 1.0.2-SNAPSHOT + + eqtl-mapping-pipeline + 1.3.5-SNAPSHOT + jar + 4.0.0 + + + nl.systemsgenetics + genetica-libraries + 1.0.7-SNAPSHOT + + + log4j + log4j + 1.2.17 + + + nl.systemsgenetics + Genotype-IO + 1.0.1 + + + net.sf.trove4j + trove4j + 3.0.3 + + + commons-cli + commons-cli + 1.2 + + + commons-beanutils + commons-beanutils + 1.8.3 + + + commons-codec + commons-codec + 1.5 + + + commons-digester + commons-digester + 2.0 + + + net.sourceforge.parallelcolt + parallelcolt + 0.10.0 + + + ${project.groupId} + imputation-tool + 1.0.3 + + + net.rforge + Rserve + 0.6-8.1 + + + org.testng + testng + 6.5.2 + test + + + net.sf.opencsv + opencsv + 2.3 + + + org.apache.commons + commons-collections4 + 4.0 + + + + + + src/main/resources + true + + **/version.properties + + + + eqtl-mapping-pipeline-${project.version} + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4 + + + + + src/main/assembly/assembly.xml + + + + + make-assembly + package + + single + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.3.1 + + + + + + + + true + + lib/ + + eqtlmappingpipeline.Main + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.7 + 1.7 + + + + \ No newline at end of file diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/BinaryInteractionMetaAnalysis.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/BinaryInteractionMetaAnalysis.java index 826f03f09..02619ca1a 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/BinaryInteractionMetaAnalysis.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/BinaryInteractionMetaAnalysis.java @@ -1,6 +1,15 @@ package eqtlmappingpipeline.binaryInteraction; import eqtlmappingpipeline.Main; +import org.apache.commons.cli.*; +import 
org.molgenis.genotype.Allele; +import umcg.genetica.io.binInteraction.*; +import umcg.genetica.io.binInteraction.gene.BinaryInteractionGene; +import umcg.genetica.io.binInteraction.gene.BinaryInteractionGeneCreator; +import umcg.genetica.io.binInteraction.variant.BinaryInteractionVariant; +import umcg.genetica.io.binInteraction.variant.BinaryInteractionVariantCreator; +import umcg.genetica.io.trityper.util.BaseAnnot; + import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; @@ -11,27 +20,8 @@ import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.OptionBuilder; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; -import org.apache.commons.cli.PosixParser; -import org.molgenis.genotype.Allele; -import umcg.genetica.io.binInteraction.BinaryInteractionCohort; -import umcg.genetica.io.binInteraction.BinaryInteractionFile; -import umcg.genetica.io.binInteraction.BinaryInteractionFileCreator; -import umcg.genetica.io.binInteraction.BinaryInteractionFileException; -import umcg.genetica.io.binInteraction.BinaryInteractionQtlZscores; -import umcg.genetica.io.binInteraction.BinaryInteractionZscores; -import umcg.genetica.io.binInteraction.gene.BinaryInteractionGene; -import umcg.genetica.io.binInteraction.gene.BinaryInteractionGeneCreator; -import umcg.genetica.io.binInteraction.variant.BinaryInteractionVariant; -import umcg.genetica.io.binInteraction.variant.BinaryInteractionVariantCreator; /** - * * @author Patrick Deelen */ public class BinaryInteractionMetaAnalysis { @@ -39,18 +29,18 @@ public class BinaryInteractionMetaAnalysis { private static final String VERSION = Main.VERSION; private static final String HEADER = " /---------------------------------------\\\n" - + " | Binary interaction meta analysis |\n" - + " | 
|\n" - + " | Patrick Deelen |\n" - + " | patrickdeelen@gmail.com |\n" - + " | |\n" - + " | Dasha Zhernakova, Marc Jan Bonder |\n" - + " | Lude Franke, Morris Swertz |\n" - + " | |\n" - + " | Genomics Coordication Center |\n" - + " | Department of Genetics |\n" - + " | University Medical Center Groningen |\n" - + " \\---------------------------------------/"; + + " | Binary interaction meta analysis |\n" + + " | |\n" + + " | Patrick Deelen |\n" + + " | patrickdeelen@gmail.com |\n" + + " | |\n" + + " | Dasha Zhernakova, Marc Jan Bonder |\n" + + " | Lude Franke, Morris Swertz |\n" + + " | |\n" + + " | Genomics Coordication Center |\n" + + " | Department of Genetics |\n" + + " | University Medical Center Groningen |\n" + + " \\---------------------------------------/"; private static final DateFormat DATE_TIME_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); private static final Date currentDataTime = new Date(); private static final Options OPTIONS; @@ -168,13 +158,25 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx BinaryInteractionVariantCreator metaVariant = variants.get(variant.getName()); - if (!(metaVariant.getRefAllele() == variant.getRefAllele() && metaVariant.getAltAllele() == variant.getAltAllele()) - && !(metaVariant.getRefAllele() == variant.getAltAllele() && metaVariant.getAltAllele() == variant.getRefAllele())) { + Boolean flipAlleles = BaseAnnot.flipalleles(variant.getRefAllele().getAlleleAsString() + "/" + variant.getAltAllele().getAlleleAsString(), variant.getRefAllele().getAlleleAsString(), + metaVariant.getRefAllele().getAlleleAsString() + "/" + metaVariant.getAltAllele().getAlleleAsString(), metaVariant.getRefAllele().getAlleleAsString()); + + +// if (!(metaVariant.getRefAllele() == variant.getRefAllele() && metaVariant.getAltAllele() == variant.getAltAllele()) +// && !(metaVariant.getRefAllele() == variant.getAltAllele() && metaVariant.getAltAllele() == variant.getRefAllele())) { +// 
System.err.println("Error: different alleles detected for variant: " + variant.getName()); +// System.exit(1); +// return; +// } + + + if (flipAlleles == null) { System.err.println("Error: different alleles detected for variant: " + variant.getName()); + System.err.println("Expected: " + metaVariant.getRefAllele().getAlleleAsString() + " / " + metaVariant.getAltAllele().getAlleleAsString()); + System.err.println("Found: " + variant.getRefAllele().getAlleleAsString() + " / " + variant.getAltAllele().getAlleleAsString()); System.exit(1); return; } - } for (int geneIndex : variant.getGenePointers()) { variantGenes.add(new VariantGene(variant.getName(), fileGenes.get(geneIndex).getName())); @@ -283,13 +285,18 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx if (binaryInteractionFile.containsVariantGene(variantName, geneName)) { - boolean swap = binaryInteractionFile.getVariant(variantName).getAltAllele() != assessedAllele; + BinaryInteractionVariant currentVariant = binaryInteractionFile.getVariant(variantName); + // boolean swap = binaryInteractionFile.getVariant(variantName).getAltAllele() != assessedAllele; + + // sorry for the ugly code :| + Boolean flipAlleles = BaseAnnot.flipalleles(variant.getRefAllele().getAlleleAsString() + "/" + variant.getAltAllele().getAlleleAsString(), assessedAllele.getAlleleAsString(), + currentVariant.getRefAllele().getAlleleAsString() + "/" + currentVariant.getAltAllele().getAlleleAsString(), currentVariant.getAltAllele().getAlleleAsString()); BinaryInteractionQtlZscores qtlRes = binaryInteractionFile.readQtlResults(variantName, geneName); for (int j = 0; j < binaryInteractionFile.getCohortCount(); ++j) { sampleCountsQtl[i] = qtlRes.getSampleCounts()[j]; zscoresQtl[i] = qtlRes.getZscores()[j]; - if (swap) { + if (flipAlleles) { zscoresQtl[i] *= -1; } ++i; @@ -341,7 +348,15 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx BinaryInteractionZscores interactionRes = 
binaryInteractionFile.readInteractionResults(variantName, geneName, covariate); - boolean swap = binaryInteractionFile.getVariant(variantName).getAltAllele() != assessedAllele; +// boolean swap = binaryInteractionFile.getVariant(variantName).getAltAllele() != assessedAllele; + + BinaryInteractionVariant currentVariant = binaryInteractionFile.getVariant(variantName); + // boolean swap = binaryInteractionFile.getVariant(variantName).getAltAllele() != assessedAllele; + + // sorry for the ugly code :| + Boolean flipAlleles = BaseAnnot.flipalleles(variant.getRefAllele().getAlleleAsString() + "/" + variant.getAltAllele().getAlleleAsString(), assessedAllele.getAlleleAsString(), + currentVariant.getRefAllele().getAlleleAsString() + "/" + currentVariant.getAltAllele().getAlleleAsString(), currentVariant.getAltAllele().getAlleleAsString()); + for (int j = 0; j < binaryInteractionFile.getCohortCount(); ++j) { sampleCountsInteraction[i] = interactionRes.getSamplesInteractionCohort()[j]; @@ -350,7 +365,7 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx zscoreInteractionCohort[i] = interactionRes.getZscoreInteractionCohort()[j]; rSquaredCohort[i] = interactionRes.getrSquaredCohort()[j]; zscoreInteractionFlippedCohort[i] = interactionRes.getZscoreInteractionFlippedCohort()[j]; - if (swap) { + if (flipAlleles) { zscoreSnpCohort[i] *= -1; zscoreInteractionCohort[i] *= -1; zscoreInteractionFlippedCohort[i] *= -1; diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/CovariateImportance.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/CovariateImportance.java index 8e1d38cf2..48d0b130f 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/CovariateImportance.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/CovariateImportance.java @@ -2,11 +2,15 @@ import au.com.bytecode.opencsv.CSVWriter; import 
gnu.trove.map.hash.TObjectDoubleHashMap; +import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashSet; import java.util.Iterator; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; @@ -46,6 +50,18 @@ public class CovariateImportance { OptionBuilder.isRequired(); OPTIONS.addOption(OptionBuilder.create("o")); + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with covariates to include in analysis"); + OptionBuilder.withLongOpt("covariats"); + OPTIONS.addOption(OptionBuilder.create("c")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with eQTL genes to include in analysis"); + OptionBuilder.withLongOpt("genes"); + OPTIONS.addOption(OptionBuilder.create("g")); + } /** @@ -55,6 +71,8 @@ public static void main(String[] args) throws FileNotFoundException, IOException final File inputInteractionFile; final File outputFile; + final File covariatesToIncludeFile; + final File genesToIncludeFile; try { final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, false); @@ -62,6 +80,18 @@ public static void main(String[] args) throws FileNotFoundException, IOException inputInteractionFile = new File(commandLine.getOptionValue("i")); outputFile = new File(commandLine.getOptionValue("o")); + if (commandLine.hasOption("c")) { + covariatesToIncludeFile = new File(commandLine.getOptionValue("c")); + } else { + covariatesToIncludeFile = null; + } + + if (commandLine.hasOption("g")) { + genesToIncludeFile = new File(commandLine.getOptionValue("g")); + } else { + genesToIncludeFile = null; + } + } catch (ParseException ex) { System.err.println("Invalid command line arguments: "); System.err.println(ex.getMessage()); @@ -73,6 
+103,40 @@ public static void main(String[] args) throws FileNotFoundException, IOException System.out.println("Input file: " + inputInteractionFile.getAbsolutePath()); System.out.println("Output file: " + outputFile); + if (covariatesToIncludeFile != null) { + System.out.println("Covariates to include: " + covariatesToIncludeFile.getAbsolutePath()); + } + if (genesToIncludeFile != null) { + System.out.println("eQTL genes to include: " + genesToIncludeFile.getAbsolutePath()); + } + + final HashSet genesToInclude; + if (genesToIncludeFile != null) { + genesToInclude = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(genesToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + genesToInclude.add(line.trim()); + } + System.out.println("eQTL genes included: " + genesToInclude.size()); + System.out.println(""); + } else { + genesToInclude = null; + } + + final HashSet covariantsToInclude; + if (covariatesToIncludeFile != null) { + covariantsToInclude = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(covariatesToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + covariantsToInclude.add(line.trim()); + } + System.out.println("Covariates included: " + covariantsToInclude.size()); + System.out.println(); + } else { + covariantsToInclude = null; + } BinaryInteractionFile inputFile = BinaryInteractionFile.load(inputInteractionFile, true); @@ -82,16 +146,26 @@ public static void main(String[] args) throws FileNotFoundException, IOException for (BinaryInteractionVariant variant : inputFile.getVariants()) { String variantName = variant.getName(); - int[] genePointers = inputFile.getVariant(variantName).getGenePointers(); + int[] genePointers = variant.getGenePointers(); + genes: for (int genePointer : genePointers) { BinaryInteractionGene gene = inputFile.getGene(genePointer); + + if (genesToInclude != 
null && !genesToInclude.contains(gene.getName())) { + continue genes; + } covariates: for (Iterator iterator = inputFile.readVariantGeneResults(variantName, gene.getName()); iterator.hasNext();) { BinaryInteractionQueryResult interation = iterator.next(); + + if (covariantsToInclude != null && !covariantsToInclude.contains(interation.getCovariateName())) { + continue covariates; + } + double metaZ = interation.getInteractionZscores().getZscoreInteractionMeta(); if (Double.isNaN(metaZ)) { continue covariates; @@ -100,13 +174,16 @@ public static void main(String[] args) throws FileNotFoundException, IOException sumChi2.adjustOrPutValue(interation.getCovariateName(), chi2, chi2); } - } - ++reporter; - if (reporter % 500 == 0) { - System.out.println("Parsed " + reporter + " of " + inputFile.getVariantGeneCombinations() + " variant-gene combinations"); + ++reporter; + if (reporter % 500 == 0) { + System.out.println("Parsed " + reporter + " of " + inputFile.getVariantGeneCombinations() + " variant-gene combinations"); + } + } + + } CSVWriter outputWriter = new CSVWriter(new BufferedWriter(new FileWriter(outputFile)), '\t', '\0', '\0'); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/InvestigateCovariate.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/InvestigateCovariate.java new file mode 100644 index 000000000..3074035ce --- /dev/null +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/InvestigateCovariate.java @@ -0,0 +1,483 @@ +package eqtlmappingpipeline.binaryInteraction; + +import au.com.bytecode.opencsv.CSVWriter; +import eqtlmappingpipeline.Main; +import gnu.trove.map.hash.TObjectIntHashMap; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Writer; 
+import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import umcg.genetica.io.binInteraction.BinaryInteractionFile; +import umcg.genetica.io.binInteraction.BinaryInteractionFileException; +import umcg.genetica.io.binInteraction.BinaryInteractionQueryResult; +import umcg.genetica.io.binInteraction.BinaryInteractionZscores; +import umcg.genetica.io.binInteraction.gene.BinaryInteractionGene; +import umcg.genetica.io.binInteraction.variant.BinaryInteractionVariant; +import umcg.genetica.math.matrix2.DoubleMatrixDataset; + +/** + * + * @author Patrick Deelen + */ +public class InvestigateCovariate { + + private static final Options OPTIONS; + + static { + + OPTIONS = new Options(); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Binary interaction file (must be a meta analysis)"); + OptionBuilder.withLongOpt("input"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("i")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Binary interaction file to use as replication (must be a meta analysis)"); + OptionBuilder.withLongOpt("replication"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("r")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Ouput prefix"); + OptionBuilder.withLongOpt("output"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("o")); + + OptionBuilder.withArgName("double"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Minimum absolute interaction z-score"); + OptionBuilder.withLongOpt("interactionZ"); + OptionBuilder.isRequired(); + 
OPTIONS.addOption(OptionBuilder.create("iz")); + + OptionBuilder.withArgName("double"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Minimum absolute replication interaction z-score"); + OptionBuilder.withLongOpt("replicationInteractionZ"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("riz")); + + OptionBuilder.withArgName("string"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Covariate name"); + OptionBuilder.withLongOpt("queryCovariate"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("qc")); + + OptionBuilder.withDescription("If set match variant on chr-pos"); + OptionBuilder.withLongOpt("chrPos"); + OPTIONS.addOption(OptionBuilder.create("cp")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with covariates to include in analysis"); + OptionBuilder.withLongOpt("covariats"); + OPTIONS.addOption(OptionBuilder.create("c")); + + } + + /** + * @param args the command line arguments + */ + public static void main(String[] args) throws IOException, FileNotFoundException, BinaryInteractionFileException { + + final File inputInteractionFile; + final File replicationInteractionFile; + final double minAbsInteractionZ; + final double minAbsReplicationInteractionZ; + final boolean matchOnChrPos; + final String outputPrefix; + final String queryCovariateName; + final File covariatesToIncludeFile; + + try { + final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, false); + + inputInteractionFile = new File(commandLine.getOptionValue("i")); + replicationInteractionFile = new File(commandLine.getOptionValue("r")); + outputPrefix = commandLine.getOptionValue("o"); + queryCovariateName = commandLine.getOptionValue("qc"); + + try { + minAbsInteractionZ = Double.parseDouble(commandLine.getOptionValue("iz")); + } catch (NumberFormatException ex) { + System.out.println("Cannot not parse --interactionZ as double: " + 
commandLine.getOptionValue("iz")); + System.exit(1); + return; + } + + try { + minAbsReplicationInteractionZ = Double.parseDouble(commandLine.getOptionValue("riz")); + } catch (NumberFormatException ex) { + System.out.println("Cannot not parse --replicationInteractionZ as double: " + commandLine.getOptionValue("riz")); + System.exit(1); + return; + } + + if (commandLine.hasOption("c")) { + covariatesToIncludeFile = new File(commandLine.getOptionValue("c")); + } else { + covariatesToIncludeFile = null; + } + + matchOnChrPos = commandLine.hasOption("cp"); + + } catch (ParseException ex) { + System.err.println("Invalid command line arguments: "); + System.err.println(ex.getMessage()); + System.err.println(); + new HelpFormatter().printHelp(" ", OPTIONS); + System.exit(1); + return; + } + BufferedWriter logWriter = new BufferedWriter(new FileWriter(outputPrefix + "_Log.txt")); + + writeAndOut("Software version: " + Main.VERSION, logWriter); + writeAndOut("Input file: " + inputInteractionFile.getAbsolutePath(), logWriter); + writeAndOut("Replication file: " + replicationInteractionFile.getAbsolutePath(), logWriter); + writeAndOut("Query covariate: " + queryCovariateName, logWriter); + writeAndOut("Output prefix: " + outputPrefix, logWriter); + writeAndOut("Min interaction z-score: " + minAbsInteractionZ, logWriter); + writeAndOut("Min replication interaction z-score: " + minAbsReplicationInteractionZ, logWriter); + if (matchOnChrPos) { + writeAndOut("Matching variants on chr-pos", logWriter); + } + if (covariatesToIncludeFile != null) { + writeAndOut("Covariates to include: " + covariatesToIncludeFile.getAbsolutePath(), logWriter); + } + writeAndOut("", logWriter); + + final HashSet covariantsToIncluded; + if (covariatesToIncludeFile != null) { + covariantsToIncluded = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(covariatesToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + 
covariantsToIncluded.add(line.trim()); + } + writeAndOut("Covariates included: " + covariantsToIncluded.size(), logWriter); + writeAndOut("", logWriter); + + if (!covariantsToIncluded.contains(queryCovariateName)) { + System.err.println("Query covariate not in include list"); + System.exit(1); + return; + } + + } else { + covariantsToIncluded = null; + } + + BinaryInteractionFile inputFile = BinaryInteractionFile.load(inputInteractionFile, true); + BinaryInteractionFile replicationFile = BinaryInteractionFile.load(replicationInteractionFile, true); + + LinkedHashSet genesOfInterest = new LinkedHashSet(); + + if (!inputFile.containsCovariant(queryCovariateName)) { + System.err.println("Covariate not found in input data"); + System.exit(1); + return; + } + + if (!replicationFile.containsCovariant(queryCovariateName)) { + System.err.println("Covariate not found in replication data"); + System.exit(1); + return; + } + + variants: + for (final BinaryInteractionVariant variant : inputFile.getVariants()) { + + final String variantName = variant.getName(); + + final BinaryInteractionVariant replicationVariant; + + if (matchOnChrPos) { + replicationVariant = replicationFile.getVariant(variant.getChr(), variant.getPos()); + if (replicationVariant == null) { + continue variants; + } + } else { + if (replicationFile.containsVariant(variantName)) { + replicationVariant = replicationFile.getVariant(variantName); + } else { + continue variants; + } + } + + //Only do if replication variant has been found + + if (!(variant.getRefAllele() == replicationVariant.getRefAllele() && variant.getAltAllele() == replicationVariant.getAltAllele()) + && !(variant.getRefAllele() == replicationVariant.getAltAllele() && variant.getAltAllele() == replicationVariant.getRefAllele())) { + System.err.println("Allele mismatch!"); + } + final boolean swap = variant.getAltAllele() != replicationVariant.getAltAllele(); + + final int[] genePointers = inputFile.getVariant(variantName).getGenePointers(); + + 
genes: + for (int genePointer : genePointers) { + + final BinaryInteractionGene gene = inputFile.getGene(genePointer); + + if (!inputFile.containsInteraction(variantName, gene.getName(), queryCovariateName)) { + continue genes; + } + + if (!replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), queryCovariateName)) { + continue genes; + } + + if (!replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), queryCovariateName)) { + continue genes; + } + + final BinaryInteractionZscores inputInteractionResult = inputFile.readInteractionResults(variantName, gene.getName(), queryCovariateName); + final double inputInteractionZ = inputInteractionResult.getZscoreInteractionMeta(); + + if (Double.isNaN(inputInteractionZ)) { + continue genes; + } + + if (!(inputInteractionZ <= -minAbsInteractionZ || inputInteractionZ >= minAbsInteractionZ)) { + continue genes; + } + + if(!replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), queryCovariateName)){ + continue genes; + } + + final BinaryInteractionZscores replicationInteractionResult = replicationFile.readInteractionResults(replicationVariant.getName(), gene.getName(), queryCovariateName); + double replicationInteractionZ = replicationInteractionResult.getZscoreInteractionMeta(); + + if (Double.isNaN(replicationInteractionZ)) { + continue genes; + } + + if (!(replicationInteractionZ <= -minAbsReplicationInteractionZ || replicationInteractionZ >= minAbsReplicationInteractionZ)) { + continue genes; + } + + //If here then discovery and replication significant + + if (swap) { + replicationInteractionZ *= -1; + } + + if (inputInteractionZ * replicationInteractionZ >= 0) { + //Same direction + genesOfInterest.add(gene.getName()); + } + + + + } + + } + + System.out.println("Number of genes of interest: " + genesOfInterest.size()); + + TObjectIntHashMap covaraitesOfInterestCount = new TObjectIntHashMap(); + TObjectIntHashMap genesOfInterestCount = new 
TObjectIntHashMap(); + LinkedHashSet covaraitesOfInterest = new LinkedHashSet(); + + //Here we now know which genes are of interest. + //We are now going to search for other covariates that are significant for any of these genes + for (String geneName : genesOfInterest) { + BinaryInteractionGene gene = inputFile.getGene(geneName); + + variants: + for (int variantPointer : gene.getVariantPointers()) { + BinaryInteractionVariant variant = inputFile.getVariant(variantPointer); + + final String variantName = variant.getName(); + + final BinaryInteractionVariant replicationVariant; + + if (matchOnChrPos) { + replicationVariant = replicationFile.getVariant(variant.getChr(), variant.getPos()); + if (replicationVariant == null) { + continue variants; + } + } else { + if (replicationFile.containsVariant(variantName)) { + replicationVariant = replicationFile.getVariant(variantName); + } else { + continue variants; + } + } + + //Only do if replication variant has been found + + if (!(variant.getRefAllele() == replicationVariant.getRefAllele() && variant.getAltAllele() == replicationVariant.getAltAllele()) + && !(variant.getRefAllele() == replicationVariant.getAltAllele() && variant.getAltAllele() == replicationVariant.getRefAllele())) { + System.err.println("Allele mismatch!"); + } + final boolean swap = variant.getAltAllele() != replicationVariant.getAltAllele(); + + covairates: + for (Iterator iterator = inputFile.readVariantGeneResults(variantName, gene.getName()); iterator.hasNext();) { + + BinaryInteractionQueryResult interaction = iterator.next(); + + if (covariantsToIncluded != null && !covariantsToIncluded.contains(interaction.getCovariateName())) { + continue covairates; + } + + final BinaryInteractionZscores inputInteractionResult = interaction.getInteractionZscores(); + final double inputInteractionZ = inputInteractionResult.getZscoreInteractionMeta(); + + if (Double.isNaN(inputInteractionZ)) { + continue; + } + + if (!(inputInteractionZ <= -minAbsInteractionZ || 
inputInteractionZ >= minAbsInteractionZ)) { + continue covairates; + } + + if(!replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), interaction.getCovariateName())){ + continue covairates; + } + + final BinaryInteractionZscores replicationInteractionResult = replicationFile.readInteractionResults(replicationVariant.getName(), gene.getName(), interaction.getCovariateName()); + double replicationInteractionZ = replicationInteractionResult.getZscoreInteractionMeta(); + + if (Double.isNaN(replicationInteractionZ)) { + continue covairates; + } + + if (!(replicationInteractionZ <= -minAbsReplicationInteractionZ || replicationInteractionZ >= minAbsReplicationInteractionZ)) { + continue covairates; + } + + //If here then discovery and replication significant + + if (swap) { + replicationInteractionZ *= -1; + } + + if (inputInteractionZ * replicationInteractionZ >= 0) { + //Same direction + covaraitesOfInterestCount.adjustOrPutValue(interaction.getCovariateName(), 1, 1); + covaraitesOfInterest.add(interaction.getCovariateName()); + genesOfInterestCount.adjustOrPutValue(geneName, 1, 1); + } + + } + + } + + } + //We now also know which other covariates are of interest + + System.out.println("Number of covariates of interest: " + covaraitesOfInterest.size()); + + + writeCounts(genesOfInterest, genesOfInterestCount, new File(outputPrefix + "_Genes.txt")); + writeCounts(covaraitesOfInterest, covaraitesOfInterestCount, new File(outputPrefix + "_Covariates.txt")); + + DoubleMatrixDataset interactionZscores = new DoubleMatrixDataset(covaraitesOfInterest, genesOfInterest); + + DoubleMatrixDataset replicationInteractionZscores = new DoubleMatrixDataset(covaraitesOfInterest, genesOfInterest); + + for (String geneName : genesOfInterest) { + + BinaryInteractionGene gene = inputFile.getGene(geneName); + + variants: + for (int variantPointer : gene.getVariantPointers()) { + BinaryInteractionVariant variant = inputFile.getVariant(variantPointer); + + final 
String variantName = variant.getName(); + + final BinaryInteractionVariant replicationVariant; + + if (matchOnChrPos) { + replicationVariant = replicationFile.getVariant(variant.getChr(), variant.getPos()); + if (replicationVariant == null) { + continue variants; + } + } else { + if (replicationFile.containsVariant(variantName)) { + replicationVariant = replicationFile.getVariant(variantName); + } else { + continue variants; + } + } + + //Only do if replication variant has been found + + if (!(variant.getRefAllele() == replicationVariant.getRefAllele() && variant.getAltAllele() == replicationVariant.getAltAllele()) + && !(variant.getRefAllele() == replicationVariant.getAltAllele() && variant.getAltAllele() == replicationVariant.getRefAllele())) { + System.err.println("Allele mismatch!"); + } + final boolean swap = variant.getAltAllele() != replicationVariant.getAltAllele(); + + + for (String covariateName : covaraitesOfInterest) { + + final BinaryInteractionZscores inputInteractionResult = inputFile.readInteractionResults(variantName, geneName, covariateName); + + //System.out.println(covariateName + "-" + geneName + "-" + inputInteractionResult.getZscoreInteractionMeta()); + interactionZscores.setElement(covariateName, geneName, inputInteractionResult.getZscoreInteractionMeta()); + + final BinaryInteractionZscores replicationInteractionResult = replicationFile.readInteractionResults(replicationVariant.getName(), gene.getName(), covariateName); + double replicationInteractionZ = replicationInteractionResult.getZscoreInteractionMeta(); + if (swap) { + replicationInteractionZ *= -1; + } + replicationInteractionZscores.setElement(covariateName, geneName, replicationInteractionZ); + + } + + + } + } + + interactionZscores.save(outputPrefix + "_InteractionMatrix.txt"); + replicationInteractionZscores.save(outputPrefix + "_ReplicationInteractionMatrix.txt"); + + logWriter.close(); + + } + + private static void writeCounts(LinkedHashSet elements, TObjectIntHashMap counts, 
File file) throws IOException { + CSVWriter writer = new CSVWriter(new BufferedWriter(new FileWriter(file)), '\t', '\0', '\0'); + + String[] row = new String[2]; + + for (String elementName : elements) { + row[0] = elementName; + row[1] = String.valueOf(counts.get(elementName)); + writer.writeNext(row); + } + + writer.close(); + + } + + private static void writeAndOut(String message, Writer writer) throws IOException { + writer.append(message); + writer.append('\n'); + System.out.println(message); + } +} diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/QueryBinaryInteraction.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/QueryBinaryInteraction.java index bfb79934b..17978aafb 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/QueryBinaryInteraction.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/QueryBinaryInteraction.java @@ -1,9 +1,12 @@ package eqtlmappingpipeline.binaryInteraction; +import au.com.bytecode.opencsv.CSVReader; import au.com.bytecode.opencsv.CSVWriter; import eqtlmappingpipeline.Main; import java.io.BufferedWriter; import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; @@ -12,14 +15,17 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashSet; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; +import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression; +import umcg.genetica.containers.Pair; import 
umcg.genetica.io.binInteraction.BinaryInteractionCohort; import umcg.genetica.io.binInteraction.BinaryInteractionFile; import umcg.genetica.io.binInteraction.BinaryInteractionFileException; @@ -44,8 +50,6 @@ public class QueryBinaryInteraction { OPTIONS = new Options(); - Option option; - OptionBuilder.withArgName("path"); OptionBuilder.hasArg(); OptionBuilder.withDescription("Binary interaction file"); @@ -68,7 +72,7 @@ public class QueryBinaryInteraction { OptionBuilder.withArgName("string"); OptionBuilder.hasArg(); OptionBuilder.withDescription("Covariate name (optional)"); - OptionBuilder.withLongOpt("cocariate"); + OptionBuilder.withLongOpt("covariate"); OPTIONS.addOption(OptionBuilder.create("c")); OptionBuilder.withArgName("string"); @@ -77,12 +81,22 @@ public class QueryBinaryInteraction { OptionBuilder.withLongOpt("variant"); OPTIONS.addOption(OptionBuilder.create("v")); + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with queries. Must have header. All columns are optional, options are gene, variant and covariate. 
Any combination of headers is possible (optional)"); + OptionBuilder.withLongOpt("queryFile"); + OPTIONS.addOption(OptionBuilder.create("qf")); + OptionBuilder.withArgName("double"); OptionBuilder.hasArg(); OptionBuilder.withDescription("Minimum absolute interaction z-score (not yet implemented)"); OptionBuilder.withLongOpt("interactionZ"); OPTIONS.addOption(OptionBuilder.create("iz")); + OptionBuilder.withDescription("Only output meta z-scores"); + OptionBuilder.withLongOpt("onlyMetaZ"); + OPTIONS.addOption(OptionBuilder.create("oz")); + } public static void main(String[] args) throws UnsupportedEncodingException, IOException, Exception { @@ -93,6 +107,8 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx final String queryCovariateName; final String queryVariantName; final double queryMinAbsInteractionZ; + final File queryFile; + final boolean onlyOutputMetaZ; try { final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, false); @@ -107,11 +123,22 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx queryCovariateName = commandLine.getOptionValue("c"); queryVariantName = commandLine.getOptionValue("v"); + if (commandLine.hasOption("qf")) { + queryFile = new File(commandLine.getOptionValue("qf")); + if (queryGeneName != null || queryVariantName != null || queryCovariateName != null) { + System.err.println("Cannot combine query file with commandline query arguments"); + System.exit(1); + return; + } + } else { + queryFile = null; + } + if (commandLine.hasOption("iz")) { try { queryMinAbsInteractionZ = Double.parseDouble(commandLine.getOptionValue("iz")); } catch (NumberFormatException ex) { - System.out.println("Cannot not parse interactionZ as double: " + commandLine.getOptionValue("iz")); + System.err.println("Cannot not parse interactionZ as double: " + commandLine.getOptionValue("iz")); System.exit(1); return; } @@ -119,6 +146,8 @@ public static void main(String[] args) throws 
UnsupportedEncodingException, IOEx queryMinAbsInteractionZ = -1; } + onlyOutputMetaZ = commandLine.hasOption("oz"); + } catch (ParseException ex) { System.err.println("Invalid command line arguments: "); System.err.println(ex.getMessage()); @@ -131,7 +160,7 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx BinaryInteractionFile inputFile = BinaryInteractionFile.load(inputInteractionFile, true); final Writer outputWriter; - if (outputFile != null) { + if (outputFile != null && !onlyOutputMetaZ) { outputWriter = new BufferedWriter(new FileWriter(outputFile)); } else { outputWriter = new OutputStreamWriter(System.out); @@ -165,6 +194,14 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx outputWriter.write("# - Query minimum absote interaction z-score: " + queryMinAbsInteractionZ); outputWriter.write('\n'); } + if (queryFile != null) { + outputWriter.write("# - Query file: " + queryFile.getAbsolutePath()); + outputWriter.write('\n'); + } + if (onlyOutputMetaZ) { + outputWriter.write("# - Only outputing meta z-scores"); + outputWriter.write('\n'); + } outputWriter.write("#\n"); outputWriter.write("# Interaction file meta data: "); @@ -187,70 +224,184 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx outputWriter.write('\n'); outputWriter.write("#\n"); + outputWriter.flush(); + + final LinkedHashSet interactionQueries; + final boolean interactionQueriesOnlyCovariates; + if (queryFile != null) { + Pair, Boolean> loadRes = loadInteractionQueries(queryFile); + interactionQueries = loadRes.getLeft(); + interactionQueriesOnlyCovariates = loadRes.getRight(); + } else { + interactionQueries = null; + interactionQueriesOnlyCovariates = false; + } + + CSVWriter tableWriter; + if (onlyOutputMetaZ) { + if (outputFile == null) { + throw new Exception("Use of option --onlyMetaZ only possible in combination with output file"); + } + tableWriter = new CSVWriter(new BufferedWriter(new 
FileWriter(outputFile)), '\t', '\0', '\0'); + } else { + tableWriter = new CSVWriter(outputWriter, '\t', '\0', '\0'); + } + + if (onlyOutputMetaZ && !inputFile.isMetaAnalysis()) { + throw new Exception("No meta analysis information detected cannot use option: --onlyMetaZ"); + } + final String[] row; - CSVWriter tableWriter = new CSVWriter(outputWriter, '\t', '\0', '\0'); + if (onlyOutputMetaZ) { + row = new String[1]; + } else { + final int columnCount = 7 + + ((5 + (inputFile.isNormalQtlStored() ? 2 : 0) + (inputFile.isFlippedZscoreStored() ? 1 : 0)) * inputFile.getCohortCount()) + + (inputFile.isMetaAnalysis() ? (3 + (inputFile.isNormalQtlStored() ? 1 : 0) + (inputFile.isFlippedZscoreStored() ? 1 : 0)) : 0); - int columnCount = - 7 - + ((5 + (inputFile.isNormalQtlStored() ? 2 : 0) + (inputFile.isFlippedZscoreStored() ? 1 : 0)) * inputFile.getCohortCount()) - + (inputFile.isMetaAnalysis() ? (3 + (inputFile.isNormalQtlStored() ? 1 : 0) + (inputFile.isFlippedZscoreStored() ? 1 : 0)) : 0); - String[] row = new String[columnCount]; - int c = 0; + row = new String[columnCount]; + int c = 0; - row[c++] = "Variant"; - row[c++] = "Gene"; - row[c++] = "Covariate"; - row[c++] = "Variant_chr"; - row[c++] = "Variant_pos"; - row[c++] = "Variant alleles"; - row[c++] = "Assessed_allele"; + row[c++] = "Variant"; + row[c++] = "Gene"; + row[c++] = "Covariate"; + row[c++] = "Variant_chr"; + row[c++] = "Variant_pos"; + row[c++] = "Variant alleles"; + row[c++] = "Assessed_allele"; - for (BinaryInteractionCohort cohort : inputFile.getCohorts()) { + for (BinaryInteractionCohort cohort : inputFile.getCohorts()) { - String cohortName = cohort.getName(); + String cohortName = cohort.getName(); - if (inputFile.isNormalQtlStored()) { - row[c++] = cohortName + "_QTL_sample_count"; - row[c++] = cohortName + "_QTL_Z-score"; - } + if (inputFile.isNormalQtlStored()) { + row[c++] = cohortName + "_QTL_sample_count"; + row[c++] = cohortName + "_QTL_Z-score"; + } + + row[c++] = cohortName + 
"_interaction_sample_count"; + row[c++] = cohortName + "_interaction_r2"; + row[c++] = cohortName + "_variant_Z-score"; + row[c++] = cohortName + "_covariate_Z-score"; + row[c++] = cohortName + "_interaction_Z-score"; + + if (inputFile.isFlippedZscoreStored()) { + row[c++] = cohortName + "_flipped_interaction_Z-score"; + } - row[c++] = cohortName + "_interaction_sample_count"; - row[c++] = cohortName + "_interaction_r2"; - row[c++] = cohortName + "_variant_Z-score"; - row[c++] = cohortName + "_covariate_Z-score"; - row[c++] = cohortName + "_interaction_Z-score"; + } - if (inputFile.isFlippedZscoreStored()) { - row[c++] = cohortName + "_flipped_interaction_Z-score"; + if (inputFile.isMetaAnalysis()) { + if (inputFile.isNormalQtlStored()) { + row[c++] = "Meta_QTL_Z-score"; + } + row[c++] = "Meta_variant_Z-score"; + row[c++] = "Meta_covariate_Z-score"; + row[c++] = "Meta_interaction_Z-score"; + if (inputFile.isFlippedZscoreStored()) { + row[c++] = "Meta_flipped_interaction_Z-score"; + } } + tableWriter.writeNext(row); } - if (inputFile.isMetaAnalysis()) { - if (inputFile.isNormalQtlStored()) { - row[c++] = "Meta_QTL_Z-score"; + if (interactionQueries != null) { + + if (interactionQueriesOnlyCovariates) { + HashSet covariateNames = new HashSet<>(interactionQueries.size()); + for (InteractoinQuery interactionQuery : interactionQueries) { + covariateNames.add(interactionQuery.getCovariate()); + } + doQueryCovariates(covariateNames, inputFile, tableWriter, row, onlyOutputMetaZ); + } else { + for (InteractoinQuery interactionQuery : interactionQueries) { + doQuery(interactionQuery.getGene(), interactionQuery.getVariant(), interactionQuery.getCovariate(), inputFile, tableWriter, row, onlyOutputMetaZ); + } + } + + } else { + doQuery(queryGeneName, queryVariantName, queryCovariateName, inputFile, tableWriter, row, onlyOutputMetaZ); + } + + + + tableWriter.close(); + outputWriter.close(); + + } + + @SuppressWarnings({"null", "ConstantConditions"}) + private static void 
addRow(BinaryInteractionQueryResult queryRestult, BinaryInteractionFile inputFile, CSVWriter tableWriter, String[] row, boolean onlyOutputMetaZ) throws BinaryInteractionFileException, IOException { + + if (onlyOutputMetaZ) { + BinaryInteractionZscores zscroresInteraction = queryRestult.getInteractionZscores(); + row[0] = String.valueOf(zscroresInteraction.getZscoreInteractionMeta()); + } else { + int c = 0; + + row[c++] = queryRestult.getVariantName(); + row[c++] = queryRestult.getGeneName(); + row[c++] = queryRestult.getCovariateName(); + + BinaryInteractionVariant variant = inputFile.getVariant(queryRestult.getVariantName()); + row[c++] = variant.getChr(); + row[c++] = String.valueOf(variant.getPos()); + row[c++] = variant.getRefAllele().getAlleleAsString() + '/' + variant.getAltAllele().getAlleleAsString(); + row[c++] = variant.getAltAllele().toString(); + + BinaryInteractionQtlZscores zscroresQtl = queryRestult.getQtlZscores(); + BinaryInteractionZscores zscroresInteraction = queryRestult.getInteractionZscores(); + + for (int cohortIndex = 0; cohortIndex < inputFile.getCohortCount(); ++cohortIndex) { + + if (inputFile.isNormalQtlStored()) { + row[c++] = String.valueOf(zscroresQtl.getSampleCounts()[cohortIndex]); + row[c++] = String.valueOf(zscroresQtl.getZscores()[cohortIndex]); + } + + row[c++] = String.valueOf(zscroresInteraction.getSamplesInteractionCohort()[cohortIndex]); + row[c++] = String.valueOf(zscroresInteraction.getrSquaredCohort()[cohortIndex]); + row[c++] = String.valueOf(zscroresInteraction.getZscoreSnpCohort()[cohortIndex]); + row[c++] = String.valueOf(zscroresInteraction.getZscoreCovariateCohort()[cohortIndex]); + row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionCohort()[cohortIndex]); + + if (inputFile.isFlippedZscoreStored()) { + row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionFlippedCohort()[cohortIndex]); + } + } - row[c++] = "Meta_variant_Z-score"; - row[c++] = "Meta_covariate_Z-score"; - row[c++] = 
"Meta_interaction_Z-score"; - if (inputFile.isFlippedZscoreStored()) { - row[c++] = "Meta_flipped_interaction_Z-score"; + + if (inputFile.isMetaAnalysis()) { + if (inputFile.isNormalQtlStored()) { + row[c++] = String.valueOf(zscroresQtl.getMetaZscore()); + } + row[c++] = String.valueOf(zscroresInteraction.getZscoreSnpMeta()); + row[c++] = String.valueOf(zscroresInteraction.getZscoreCovariateMeta()); + row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionMeta()); + if (inputFile.isFlippedZscoreStored()) { + row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionFlippedMeta()); + } } } + tableWriter.writeNext(row); + } + + private static void doQuery(final String queryGeneName, final String queryVariantName, final String queryCovariateName, BinaryInteractionFile inputFile, CSVWriter tableWriter, String[] row, boolean onlyOutputMetaZ) throws IOException, BinaryInteractionFileException { if (queryGeneName != null && queryVariantName != null && queryCovariateName != null) { - addRow(inputFile.readVariantGeneCovariateResults(queryVariantName, queryGeneName, queryCovariateName), inputFile, tableWriter, row); + addRow(inputFile.readVariantGeneCovariateResults(queryVariantName, queryGeneName, queryCovariateName), inputFile, tableWriter, row, onlyOutputMetaZ); } else if (queryGeneName != null && queryVariantName != null) { for (Iterator iterator = inputFile.readVariantGeneResults(queryVariantName, queryGeneName); iterator.hasNext();) { - addRow(iterator.next(), inputFile, tableWriter, row); + addRow(iterator.next(), inputFile, tableWriter, row, onlyOutputMetaZ); } } else if (queryVariantName != null) { @@ -262,12 +413,12 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx if (queryCovariateName != null) { if (inputFile.containsInteraction(queryVariantName, gene.getName(), queryCovariateName)) { - addRow(inputFile.readVariantGeneCovariateResults(queryVariantName, gene.getName(), queryCovariateName), inputFile, 
tableWriter, row); + addRow(inputFile.readVariantGeneCovariateResults(queryVariantName, gene.getName(), queryCovariateName), inputFile, tableWriter, row, onlyOutputMetaZ); } } else { for (Iterator iterator = inputFile.readVariantGeneResults(queryVariantName, gene.getName()); iterator.hasNext();) { - addRow(iterator.next(), inputFile, tableWriter, row); + addRow(iterator.next(), inputFile, tableWriter, row, onlyOutputMetaZ); } } @@ -282,12 +433,12 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx if (queryCovariateName != null) { if (inputFile.containsInteraction(variant.getName(), queryGeneName, queryCovariateName)) { - addRow(inputFile.readVariantGeneCovariateResults(variant.getName(), queryGeneName, queryCovariateName), inputFile, tableWriter, row); + addRow(inputFile.readVariantGeneCovariateResults(variant.getName(), queryGeneName, queryCovariateName), inputFile, tableWriter, row, onlyOutputMetaZ); } } else { for (Iterator iterator = inputFile.readVariantGeneResults(variant.getName(), queryGeneName); iterator.hasNext();) { - addRow(iterator.next(), inputFile, tableWriter, row); + addRow(iterator.next(), inputFile, tableWriter, row, onlyOutputMetaZ); } } @@ -296,10 +447,10 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx } else { - for (BinaryInteractionVariant variant : inputFile.getVariants()) { - + for (BinaryInteractionVariant variant : inputFile.getVariants()) { + String variantName = variant.getName(); - + int[] genePointers = inputFile.getVariant(variantName).getGenePointers(); for (int genePointer : genePointers) { @@ -307,12 +458,12 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx if (queryCovariateName != null) { if (inputFile.containsInteraction(variantName, gene.getName(), queryCovariateName)) { - addRow(inputFile.readVariantGeneCovariateResults(variantName, gene.getName(), queryCovariateName), inputFile, tableWriter, row); + 
addRow(inputFile.readVariantGeneCovariateResults(variantName, gene.getName(), queryCovariateName), inputFile, tableWriter, row, onlyOutputMetaZ); } } else { for (Iterator iterator = inputFile.readVariantGeneResults(variantName, gene.getName()); iterator.hasNext();) { - addRow(iterator.next(), inputFile, tableWriter, row); + addRow(iterator.next(), inputFile, tableWriter, row, onlyOutputMetaZ); } } @@ -323,60 +474,149 @@ public static void main(String[] args) throws UnsupportedEncodingException, IOEx } + } - tableWriter.close(); - outputWriter.close(); + private static void doQueryCovariates(final HashSet queryCovariateNames, BinaryInteractionFile inputFile, CSVWriter tableWriter, String[] row, boolean onlyOutputMetaZ) throws IOException, BinaryInteractionFileException { + + for (BinaryInteractionVariant variant : inputFile.getVariants()) { + + String variantName = variant.getName(); + + int[] genePointers = inputFile.getVariant(variantName).getGenePointers(); + for (int genePointer : genePointers) { + + BinaryInteractionGene gene = inputFile.getGene(genePointer); + + for (Iterator iterator = inputFile.readVariantGeneResults(variantName, gene.getName()); iterator.hasNext();) { + BinaryInteractionQueryResult next = iterator.next(); + if(queryCovariateNames.contains(next.getCovariateName())){ + addRow(next, inputFile, tableWriter, row, onlyOutputMetaZ); + } + + } + + } + + } } - @SuppressWarnings({"null", "ConstantConditions"}) - private static void addRow(BinaryInteractionQueryResult queryRestult, BinaryInteractionFile inputFile, CSVWriter tableWriter, String[] row) throws BinaryInteractionFileException, IOException { - int c = 0; + private static Pair, Boolean> loadInteractionQueries(File queryFile) throws FileNotFoundException, IOException, Exception { - row[c++] = queryRestult.getVariantName(); - row[c++] = queryRestult.getGeneName(); - row[c++] = queryRestult.getCovariateName(); + LinkedHashSet interactionQueries = new LinkedHashSet(); + final CSVReader 
queryReader = new CSVReader(new FileReader(queryFile), '\t', '\0'); - BinaryInteractionVariant variant = inputFile.getVariant(queryRestult.getVariantName()); - row[c++] = variant.getChr(); - row[c++] = String.valueOf(variant.getPos()); - row[c++] = variant.getRefAllele().getAlleleAsString() + '/' + variant.getAltAllele().getAlleleAsString(); - row[c++] = variant.getAltAllele().toString(); + String[] nextLine = queryReader.readNext(); - BinaryInteractionQtlZscores zscroresQtl = queryRestult.getQtlZscores(); - BinaryInteractionZscores zscroresInteraction = queryRestult.getInteractionZscores(); + int variantCol = -1; + int geneCol = -1; + int covariateCol = -1; - for (int cohortIndex = 0; cohortIndex < inputFile.getCohortCount(); ++cohortIndex) { + //Parse header + for (int i = 0; i < nextLine.length; ++i) { + String headerEntry = nextLine[i].toLowerCase(); + switch (headerEntry) { + case "variant": + if (variantCol != -1) { + throw new Exception("Variant column found twice"); + } + variantCol = i; + break; + case "gene": + if (geneCol != -1) { + throw new Exception("Gene column found twice"); + } + geneCol = i; + break; + case "covariate": + if (covariateCol != -1) { + throw new Exception("Covariate column found twice"); + } + covariateCol = i; + break; - if (inputFile.isNormalQtlStored()) { - row[c++] = String.valueOf(zscroresQtl.getSampleCounts()[cohortIndex]); - row[c++] = String.valueOf(zscroresQtl.getZscores()[cohortIndex]); } - row[c++] = String.valueOf(zscroresInteraction.getSamplesInteractionCohort()[cohortIndex]); - row[c++] = String.valueOf(zscroresInteraction.getrSquaredCohort()[cohortIndex]); - row[c++] = String.valueOf(zscroresInteraction.getZscoreSnpCohort()[cohortIndex]); - row[c++] = String.valueOf(zscroresInteraction.getZscoreCovariateCohort()[cohortIndex]); - row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionCohort()[cohortIndex]); + } - if (inputFile.isFlippedZscoreStored()) { - row[c++] = 
String.valueOf(zscroresInteraction.getZscoreInteractionFlippedCohort()[cohortIndex]); - } + if (variantCol == -1 && geneCol == -1 && covariateCol == -1) { + throw new Exception("Did not detect appropiate header in query file"); } - if (inputFile.isMetaAnalysis()) { - if (inputFile.isNormalQtlStored()) { - row[c++] = String.valueOf(zscroresQtl.getMetaZscore()); + while ((nextLine = queryReader.readNext()) != null) { + String variant = null; + String gene = null; + String covariate = null; + + if (variantCol != -1) { + variant = nextLine[variantCol]; + } + if (geneCol != -1) { + gene = nextLine[geneCol]; } - row[c++] = String.valueOf(zscroresInteraction.getZscoreSnpMeta()); - row[c++] = String.valueOf(zscroresInteraction.getZscoreCovariateMeta()); - row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionMeta()); - if (inputFile.isFlippedZscoreStored()) { - row[c++] = String.valueOf(zscroresInteraction.getZscoreInteractionFlippedMeta()); + if (covariateCol != -1) { + covariate = nextLine[covariateCol]; } + interactionQueries.add(new InteractoinQuery(variant, gene, covariate)); } + queryReader.close(); - tableWriter.writeNext(row); + return new Pair(interactionQueries, variantCol == -1 && geneCol == -1); + } + + private static class InteractoinQuery { + + private final String variant; + private final String gene; + private final String covariate; + + public InteractoinQuery(String variant, String gene, String covariate) { + this.variant = variant; + this.gene = gene; + this.covariate = covariate; + } + + public String getVariant() { + return variant; + } + + public String getGene() { + return gene; + } + + public String getCovariate() { + return covariate; + } + + @Override + public int hashCode() { + int hash = 5; + hash = 67 * hash + (this.variant != null ? this.variant.hashCode() : 0); + hash = 67 * hash + (this.gene != null ? this.gene.hashCode() : 0); + hash = 67 * hash + (this.covariate != null ? 
this.covariate.hashCode() : 0); + return hash; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final InteractoinQuery other = (InteractoinQuery) obj; + if ((this.variant == null) ? (other.variant != null) : !this.variant.equals(other.variant)) { + return false; + } + if ((this.gene == null) ? (other.gene != null) : !this.gene.equals(other.gene)) { + return false; + } + if ((this.covariate == null) ? (other.covariate != null) : !this.covariate.equals(other.covariate)) { + return false; + } + return true; + } } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/ReplicateInteractions.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/ReplicateInteractions.java index 7f7ae2086..295f00063 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/ReplicateInteractions.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binaryInteraction/ReplicateInteractions.java @@ -1,6 +1,7 @@ package eqtlmappingpipeline.binaryInteraction; import au.com.bytecode.opencsv.CSVWriter; +import eqtlmappingpipeline.Main; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; @@ -9,15 +10,14 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; -import java.text.DateFormat; +import java.io.Writer; import java.text.NumberFormat; -import java.text.SimpleDateFormat; -import java.util.Date; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; @@ -77,6 +77,20 @@ public class ReplicateInteractions { 
OptionBuilder.isRequired(); OPTIONS.addOption(OptionBuilder.create("riz")); + OptionBuilder.withArgName("double"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Minimum absolute interaction z-score to count covariate"); + OptionBuilder.withLongOpt("covariateInteractionZ"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("ciz")); + + OptionBuilder.withArgName("double"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Minimum absolute replication interaction z-score to count covariate"); + OptionBuilder.withLongOpt("covariateReplicationInteractionZ"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("criz")); + OptionBuilder.withDescription("If set match variant on chr-pos"); OptionBuilder.withLongOpt("chrPos"); OPTIONS.addOption(OptionBuilder.create("cp")); @@ -87,6 +101,24 @@ public class ReplicateInteractions { OptionBuilder.withLongOpt("covariats"); OPTIONS.addOption(OptionBuilder.create("c")); + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with eQTL genes to include in analysis"); + OptionBuilder.withLongOpt("genes"); + OPTIONS.addOption(OptionBuilder.create("g")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with covariates to test for replication"); + OptionBuilder.withLongOpt("covariatsReplication"); + OPTIONS.addOption(OptionBuilder.create("cr")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("File with eQTL genes to test for replication"); + OptionBuilder.withLongOpt("genesReplication"); + OPTIONS.addOption(OptionBuilder.create("gr")); + } public static void main(String[] args) throws FileNotFoundException, IOException, BinaryInteractionFileException { @@ -95,9 +127,14 @@ public static void main(String[] args) throws FileNotFoundException, IOException final File replicationInteractionFile; final double minAbsInteractionZ; final 
double minAbsReplicationInteractionZ; + final double minAbsInteractionZCovariateCount; + final double minAbsReplicationInteractionZCovariateCount; final boolean matchOnChrPos; final String outputPrefix; final File covariatesToIncludeFile; + final File genesToIncludeFile; + final File covariatesReplicationToIncludeFile; + final File genesReplicationToIncludeFile; try { final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, false); @@ -122,12 +159,46 @@ public static void main(String[] args) throws FileNotFoundException, IOException return; } + try { + minAbsInteractionZCovariateCount = Double.parseDouble(commandLine.getOptionValue("ciz")); + } catch (NumberFormatException ex) { + System.out.println("Cannot not parse --covariateInteractionZ as double: " + commandLine.getOptionValue("ciz")); + System.exit(1); + return; + } + + try { + minAbsReplicationInteractionZCovariateCount = Double.parseDouble(commandLine.getOptionValue("criz")); + } catch (NumberFormatException ex) { + System.out.println("Cannot not parse --covariateReplicationInteractionZ as double: " + commandLine.getOptionValue("criz")); + System.exit(1); + return; + } + if (commandLine.hasOption("c")) { covariatesToIncludeFile = new File(commandLine.getOptionValue("c")); } else { covariatesToIncludeFile = null; } + if (commandLine.hasOption("g")) { + genesToIncludeFile = new File(commandLine.getOptionValue("g")); + } else { + genesToIncludeFile = null; + } + + if (commandLine.hasOption("cr")) { + covariatesReplicationToIncludeFile = new File(commandLine.getOptionValue("cr")); + } else { + covariatesReplicationToIncludeFile = null; + } + + if (commandLine.hasOption("gr")) { + genesReplicationToIncludeFile = new File(commandLine.getOptionValue("gr")); + } else { + genesReplicationToIncludeFile = null; + } + matchOnChrPos = commandLine.hasOption("cp"); } catch (ParseException ex) { @@ -138,34 +209,91 @@ public static void main(String[] args) throws FileNotFoundException, IOException 
System.exit(1); return; } - - System.out.println("Input file: " + inputInteractionFile.getAbsolutePath()); - System.out.println("Replication file: " + replicationInteractionFile.getAbsolutePath()); - System.out.println("Output prefix: " + outputPrefix); - System.out.println("Min interaction z-score: " + minAbsInteractionZ); - System.out.println("Min replication interaction z-score: " + minAbsReplicationInteractionZ); + BufferedWriter logWriter = new BufferedWriter(new FileWriter(outputPrefix + "_Log.txt")); + + writeAndOut("Software version: " + Main.VERSION, logWriter); + writeAndOut("Input file: " + inputInteractionFile.getAbsolutePath(), logWriter); + writeAndOut("Replication file: " + replicationInteractionFile.getAbsolutePath(), logWriter); + writeAndOut("Output prefix: " + outputPrefix, logWriter); + writeAndOut("Min interaction z-score: " + minAbsInteractionZ, logWriter); + writeAndOut("Min replication interaction z-score: " + minAbsReplicationInteractionZ, logWriter); + writeAndOut("Min interaction z-score covariate counter: " + minAbsInteractionZCovariateCount, logWriter); + writeAndOut("Min replication interaction z-score covariate counter: " + minAbsReplicationInteractionZCovariateCount, logWriter); if (matchOnChrPos) { - System.out.println("Matching variants on chr-pos"); + writeAndOut("Matching variants on chr-pos", logWriter); } if (covariatesToIncludeFile != null) { - System.out.println("Covariates to include: " + covariatesToIncludeFile.getAbsolutePath()); + writeAndOut("Covariates to include: " + covariatesToIncludeFile.getAbsolutePath(), logWriter); + } + if (genesToIncludeFile != null) { + writeAndOut("eQTL genes to include: " + genesToIncludeFile.getAbsolutePath(), logWriter); } - System.out.println(); + if (covariatesReplicationToIncludeFile != null) { + writeAndOut("Covariates replication to include: " + covariatesReplicationToIncludeFile.getAbsolutePath(), logWriter); + } + if (genesReplicationToIncludeFile != null) { + writeAndOut("eQTL 
genes replication to include: " + genesReplicationToIncludeFile.getAbsolutePath(), logWriter); + } + + writeAndOut("", logWriter); + final HashSet covariantsToInclude; if (covariatesToIncludeFile != null) { covariantsToInclude = new HashSet(); BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(covariatesToIncludeFile), "UTF-8")); - String line; + String line; while ((line = reader.readLine()) != null) { covariantsToInclude.add(line.trim()); } - System.out.println("Covariates included: " + covariantsToInclude.size()); - System.out.println(); + writeAndOut("Covariates included: " + covariantsToInclude.size(), logWriter); + writeAndOut("", logWriter); } else { covariantsToInclude = null; } + final HashSet genesToInclude; + if (genesToIncludeFile != null) { + genesToInclude = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(genesToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + genesToInclude.add(line.trim()); + } + writeAndOut("eQTL genes included: " + genesToInclude.size(), logWriter); + writeAndOut("", logWriter); + } else { + genesToInclude = null; + } + + final HashSet covariantsReplicationToInclude; + if (covariatesReplicationToIncludeFile != null) { + covariantsReplicationToInclude = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(covariatesReplicationToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + covariantsReplicationToInclude.add(line.trim()); + } + writeAndOut("Covariates replication included: " + covariantsReplicationToInclude.size(), logWriter); + writeAndOut("", logWriter); + } else { + covariantsReplicationToInclude = null; + } + + final HashSet genesReplicationToInclude; + if (genesReplicationToIncludeFile != null) { + genesReplicationToInclude = new HashSet(); + BufferedReader reader = new BufferedReader(new InputStreamReader(new 
FileInputStream(genesReplicationToIncludeFile), "UTF-8")); + String line; + while ((line = reader.readLine()) != null) { + genesReplicationToInclude.add(line.trim()); + } + writeAndOut("eQTL genes replication included: " + genesReplicationToInclude.size(), logWriter); + writeAndOut("", logWriter); + } else { + genesReplicationToInclude = null; + } + BinaryInteractionFile inputFile = BinaryInteractionFile.load(inputInteractionFile, true); BinaryInteractionFile replicationFile = BinaryInteractionFile.load(replicationInteractionFile, true); @@ -175,10 +303,12 @@ public static void main(String[] args) throws FileNotFoundException, IOException CSVWriter replicatedOppositeDirectionWriter = writeHeader(new File(outputPrefix + "_ReplicatedOppositeDirection.txt"), row); CSVWriter notReplicatedSameDirectionWriter = writeHeader(new File(outputPrefix + "_NotReplicatedSameDirection.txt"), row); CSVWriter notReplicatedOppositeDirectionWriter = writeHeader(new File(outputPrefix + "_NotReplicatedOppositeDirection.txt"), row); + CSVWriter notInReplicationWriter = writeHeader(new File(outputPrefix + "_NotInReplication.txt"), row); int significant = 0; int notSignificant = 0; int notTestedInReplication = 0; + int nanReplication = 0; int notSignificantReplicationSameDirection = 0; int notSignificantReplicationOppositeDirection = 0; int significantReplicationOppositeDirection = 0; @@ -186,11 +316,17 @@ public static void main(String[] args) throws FileNotFoundException, IOException int reporter = 0; + LinkedHashMap covariateCounts = new LinkedHashMap(inputFile.getCovariateCount()); + for (String covariate : inputFile.getCovariates()) { + covariateCounts.put(covariate, new CovariateCount()); + } + for (BinaryInteractionVariant variant : inputFile.getVariants()) { String variantName = variant.getName(); BinaryInteractionVariant replicationVariant; + boolean swap; if (matchOnChrPos) { replicationVariant = replicationFile.getVariant(variant.getChr(), variant.getPos()); @@ -201,76 +337,131 
@@ public static void main(String[] args) throws FileNotFoundException, IOException replicationVariant = null; } } + + if (replicationVariant != null) { + if (!(variant.getRefAllele() == replicationVariant.getRefAllele() && variant.getAltAllele() == replicationVariant.getAltAllele()) + && !(variant.getRefAllele() == replicationVariant.getAltAllele() && variant.getAltAllele() == replicationVariant.getRefAllele())) { + System.err.println("Allele mismatch!"); + } + swap = variant.getAltAllele() != replicationVariant.getAltAllele(); + } else { + swap = false; + } + //Do loop anyway to also count not replicated int[] genePointers = inputFile.getVariant(variantName).getGenePointers(); + genes: for (int genePointer : genePointers) { BinaryInteractionGene gene = inputFile.getGene(genePointer); + if (genesToInclude != null && !genesToInclude.contains(gene.getName())) { + continue genes; + } covairates: for (Iterator iterator = inputFile.readVariantGeneResults(variantName, gene.getName()); iterator.hasNext();) { - BinaryInteractionQueryResult interation = iterator.next(); + BinaryInteractionQueryResult interaction = iterator.next(); - if (covariantsToInclude != null && !covariantsToInclude.contains(interation.getCovariateName())) { + if (covariantsToInclude != null && !covariantsToInclude.contains(interaction.getCovariateName())) { continue covairates; } - double metaInteractionZ = interation.getInteractionZscores().getZscoreInteractionMeta(); + double metaInteractionZ = interaction.getInteractionZscores().getZscoreInteractionMeta(); if (metaInteractionZ >= minAbsInteractionZ || metaInteractionZ <= -minAbsInteractionZ) { ++significant; - if (replicationVariant != null && replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), interation.getCovariateName())) { - - if (!(variant.getRefAllele() == replicationVariant.getRefAllele() && variant.getAltAllele() == replicationVariant.getAltAllele()) - && !(variant.getRefAllele() == 
replicationVariant.getAltAllele() && variant.getAltAllele() == replicationVariant.getRefAllele())) { - System.err.println("Allele mismatch!"); - continue covairates; - } + if (replicationVariant != null && replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), interaction.getCovariateName()) && (genesReplicationToInclude == null || genesReplicationToInclude.contains(interaction.getGeneName())) && (covariantsReplicationToInclude == null || covariantsReplicationToInclude.contains(interaction.getCovariateName()))) { - BinaryInteractionZscores replicationZscores = replicationFile.readInteractionResults(replicationVariant.getName(), gene.getName(), interation.getCovariateName()); + BinaryInteractionZscores replicationZscores = replicationFile.readInteractionResults(replicationVariant.getName(), gene.getName(), interaction.getCovariateName()); double replicationInteractionZscore = replicationZscores.getZscoreInteractionMeta(); - boolean swap = variant.getAltAllele() != replicationVariant.getAltAllele(); - BinaryInteractionQtlZscores replicationQtlRes = replicationFile.readQtlResults(replicationVariant.getName(), gene.getName()); - if (swap) { - replicationInteractionZscore *= -1; - } + if (!Double.isNaN(replicationInteractionZscore)) { - if (replicationInteractionZscore <= -minAbsReplicationInteractionZ || replicationInteractionZscore >= minAbsReplicationInteractionZ) { - if (metaInteractionZ * replicationInteractionZscore >= 0) { - ++significantReplicationSameDirection; + if (swap) { + replicationInteractionZscore *= -1; + } + if (replicationInteractionZscore <= -minAbsReplicationInteractionZ || replicationInteractionZscore >= minAbsReplicationInteractionZ) { + if (metaInteractionZ * replicationInteractionZscore >= 0) { + ++significantReplicationSameDirection;
+ writeInteraction(row, variantName, gene, interaction, variant, replicationQtlRes, replicationZscores, swap, replicatedSameDirectionWriter); - writeInteraction(row, variantName, gene, interation, variant, replicationQtlRes, replicationZscores, swap, replicatedOppositeDirectionWriter); - } - } else { - if (metaInteractionZ * replicationInteractionZscore >= 0) { - ++notSignificantReplicationSameDirection; - writeInteraction(row, variantName, gene, interation, variant, replicationQtlRes, replicationZscores, swap, notReplicatedSameDirectionWriter); + } else { + ++significantReplicationOppositeDirection; + + writeInteraction(row, variantName, gene, interaction, variant, replicationQtlRes, replicationZscores, swap, replicatedOppositeDirectionWriter); + } } else { - ++notSignificantReplicationOppositeDirection; - writeInteraction(row, variantName, gene, interation, variant, replicationQtlRes, replicationZscores, swap, notReplicatedOppositeDirectionWriter); + if (metaInteractionZ * replicationInteractionZscore >= 0) { + ++notSignificantReplicationSameDirection; + writeInteraction(row, variantName, gene, interaction, variant, replicationQtlRes, replicationZscores, swap, notReplicatedSameDirectionWriter); + } else { + ++notSignificantReplicationOppositeDirection; + writeInteraction(row, variantName, gene, interaction, variant, replicationQtlRes, replicationZscores, swap, notReplicatedOppositeDirectionWriter); + } } + } else { + writeInteraction(row, variantName, gene, interaction, variant, replicationQtlRes, replicationZscores, swap, notInReplicationWriter); + ++nanReplication; } - } else { + writeInteraction(row, variantName, gene, interaction, variant, null, null, swap, notInReplicationWriter); ++notTestedInReplication; } } else { ++notSignificant; } + + if (metaInteractionZ >= minAbsInteractionZCovariateCount || metaInteractionZ <= -minAbsInteractionZCovariateCount) { + + CovariateCount thisCovariateCounts = covariateCounts.get(interaction.getCovariateName()); + 
thisCovariateCounts.incrementCovariateSignificant(); + + if (replicationVariant != null && replicationFile.containsInteraction(replicationVariant.getName(), gene.getName(), interaction.getCovariateName())) { + + BinaryInteractionZscores replicationZscores = replicationFile.readInteractionResults(replicationVariant.getName(), gene.getName(), interaction.getCovariateName()); + double replicationInteractionZscore = replicationZscores.getZscoreInteractionMeta(); + + if (!Double.isNaN(replicationInteractionZscore)) { + + if (swap) { + replicationInteractionZscore *= -1; + } + + if (replicationInteractionZscore <= -minAbsReplicationInteractionZCovariateCount || replicationInteractionZscore >= minAbsReplicationInteractionZCovariateCount) { + if (metaInteractionZ * replicationInteractionZscore >= 0) { + thisCovariateCounts.incrementReplicatedSameDirection(); + + } else { + thisCovariateCounts.incrementReplicatedOppositeDirection(); + } + } else { + if (metaInteractionZ * replicationInteractionZscore >= 0) { + thisCovariateCounts.incrementNotReplicatedSameDirection(); + } else { + thisCovariateCounts.incrementNotReplicatedOppositeDirection(); + } + } + + } else { + } + + } else { + } + + } else { + } + + } ++reporter; @@ -286,22 +477,28 @@ public static void main(String[] args) throws FileNotFoundException, IOException replicatedOppositeDirectionWriter.close(); notReplicatedSameDirectionWriter.close(); notReplicatedOppositeDirectionWriter.close(); + notInReplicationWriter.close(); + + writeCovaraiteCounts(new File(outputPrefix + "_CovariateCounts.txt"), covariateCounts); NumberFormat numberFormat = NumberFormat.getInstance(); numberFormat.setMinimumFractionDigits(0); numberFormat.setMaximumFractionDigits(2); - System.out.println(""); - System.out.println("Total number of interactions: " + numberFormat.format(notSignificant + significant)); - System.out.println(" - Not significant: " + numberFormat.format(notSignificant) + " (" + numberFormat.format(notSignificant * 100d / 
(notSignificant + significant)) + "%)"); - System.out.println(" - Significant: " + numberFormat.format(significant) + " (" + numberFormat.format(significant * 100d / (notSignificant + significant)) + "%)"); - System.out.println(" * Not in replication: " + numberFormat.format(notTestedInReplication) + " (" + numberFormat.format(notTestedInReplication * 100d / significant) + "%)"); - System.out.println(" * Not significant in replication: " + numberFormat.format(notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection) + " (" + numberFormat.format((notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection) * 100d / significant) + "%)"); - System.out.println(" # Same direction: " + numberFormat.format(notSignificantReplicationSameDirection) + " (" + numberFormat.format(notSignificantReplicationSameDirection * 100d / (notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection)) + "%)"); - System.out.println(" # Opposite direction: " + numberFormat.format(notSignificantReplicationOppositeDirection) + " (" + numberFormat.format(notSignificantReplicationOppositeDirection * 100d / (notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection)) + "%)"); - System.out.println(" * Significant in replication: " + numberFormat.format(significantReplicationSameDirection + significantReplicationOppositeDirection) + " (" + numberFormat.format((significantReplicationSameDirection + significantReplicationOppositeDirection) * 100d / significant) + "%)"); - System.out.println(" # Same direction: " + numberFormat.format(significantReplicationSameDirection) + " (" + numberFormat.format(significantReplicationSameDirection * 100d / (significantReplicationSameDirection + significantReplicationOppositeDirection)) + "%)"); - System.out.println(" # Opposite direction: " + numberFormat.format(significantReplicationOppositeDirection) + " (" + 
numberFormat.format(significantReplicationOppositeDirection * 100d / (significantReplicationSameDirection + significantReplicationOppositeDirection)) + "%)"); + writeAndOut("", logWriter); + writeAndOut("Total number of interactions: " + numberFormat.format(notSignificant + significant), logWriter); + writeAndOut(" - Not significant: " + numberFormat.format(notSignificant) + " (" + numberFormat.format(notSignificant * 100d / (notSignificant + significant)) + "%)", logWriter); + writeAndOut(" - Significant: " + numberFormat.format(significant) + " (" + numberFormat.format(significant * 100d / (notSignificant + significant)) + "%)", logWriter); + writeAndOut(" * Not in replication: " + numberFormat.format(notTestedInReplication) + " (" + numberFormat.format(notTestedInReplication * 100d / significant) + "%)", logWriter); + writeAndOut(" * NaN in replication: " + numberFormat.format(nanReplication) + " (" + numberFormat.format(nanReplication * 100d / significant) + "%)", logWriter); + writeAndOut(" * Not significant in replication: " + numberFormat.format(notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection) + " (" + numberFormat.format((notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection) * 100d / significant) + "%)", logWriter); + writeAndOut(" # Same direction: " + numberFormat.format(notSignificantReplicationSameDirection) + " (" + numberFormat.format(notSignificantReplicationSameDirection * 100d / (notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection)) + "%)", logWriter); + writeAndOut(" # Opposite direction: " + numberFormat.format(notSignificantReplicationOppositeDirection) + " (" + numberFormat.format(notSignificantReplicationOppositeDirection * 100d / (notSignificantReplicationSameDirection + notSignificantReplicationOppositeDirection)) + "%)", logWriter); + writeAndOut(" * Significant in replication: " + numberFormat.format(significantReplicationSameDirection + 
significantReplicationOppositeDirection) + " (" + numberFormat.format((significantReplicationSameDirection + significantReplicationOppositeDirection) * 100d / significant) + "%)", logWriter); + writeAndOut(" # Same direction: " + numberFormat.format(significantReplicationSameDirection) + " (" + numberFormat.format(significantReplicationSameDirection * 100d / (significantReplicationSameDirection + significantReplicationOppositeDirection)) + "%)", logWriter); + writeAndOut(" # Opposite direction: " + numberFormat.format(significantReplicationOppositeDirection) + " (" + numberFormat.format(significantReplicationOppositeDirection * 100d / (significantReplicationSameDirection + significantReplicationOppositeDirection)) + "%)", logWriter); + + logWriter.close(); } @@ -318,10 +515,10 @@ private static void writeInteraction(String[] row, String variantName, BinaryInt row[c++] = String.valueOf(interation.getInteractionZscores().getZscoreSnpMeta()); row[c++] = String.valueOf(interation.getInteractionZscores().getZscoreCovariateMeta()); row[c++] = String.valueOf(interation.getInteractionZscores().getZscoreInteractionMeta()); - row[c++] = String.valueOf(replicationQtlRes.getMetaZscore() * (swap ? -1 : 1)); - row[c++] = String.valueOf(replicationZscores.getZscoreSnpMeta() * (swap ? -1 : 1)); - row[c++] = String.valueOf(replicationZscores.getZscoreCovariateMeta()); - row[c++] = String.valueOf(replicationZscores.getZscoreInteractionMeta() * (swap ? -1 : 1)); + row[c++] = replicationQtlRes == null ? "NaN" : String.valueOf(replicationQtlRes.getMetaZscore() * (swap ? -1 : 1)); + row[c++] = replicationZscores == null ? "NaN" : String.valueOf(replicationZscores.getZscoreSnpMeta() * (swap ? -1 : 1)); + row[c++] = replicationZscores == null ? "NaN" : String.valueOf(replicationZscores.getZscoreCovariateMeta()); + row[c++] = replicationZscores == null ? "NaN" : String.valueOf(replicationZscores.getZscoreInteractionMeta() * (swap ? 
-1 : 1)); interactionWriter.writeNext(row); } @@ -346,4 +543,91 @@ private static CSVWriter writeHeader(File file, String[] row) throws IOException replicatedSameDirectionWriter.writeNext(row); return replicatedSameDirectionWriter; } + + private static void writeCovaraiteCounts(File file, LinkedHashMap covariateCounts) throws IOException { + + CSVWriter covariateCountWriter = new CSVWriter(new BufferedWriter(new FileWriter(file)), '\t', '\0', '\0'); + int c = 0; + String[] row2 = new String[6]; + row2[c++] = "Covariate"; + row2[c++] = "Significant"; + row2[c++] = "ReplicatedSameDirection"; + row2[c++] = "ReplicatedOppositeDirection"; + row2[c++] = "NotReplicateSameDirection"; + row2[c++] = "NotReplicatedOppositeDirection"; + covariateCountWriter.writeNext(row2); + + for (Map.Entry covariateEntry : covariateCounts.entrySet()) { + + CovariateCount thisCounts = covariateEntry.getValue(); + + c = 0; + row2[c++] = covariateEntry.getKey(); + row2[c++] = String.valueOf(thisCounts.getCovariateSignificant()); + row2[c++] = String.valueOf(thisCounts.getReplicatedSameDirection()); + row2[c++] = String.valueOf(thisCounts.getReplicatedOppositeDirection()); + row2[c++] = String.valueOf(thisCounts.getNotReplicatedSameDirection()); + row2[c++] = String.valueOf(thisCounts.getNotReplicatedOppositeDirection()); + covariateCountWriter.writeNext(row2); + + } + + covariateCountWriter.close(); + + } + + private static void writeAndOut(String message, Writer writer) throws IOException { + writer.append(message); + writer.append('\n'); + System.out.println(message); + } + + private static class CovariateCount { + + private int covariateSignificant = 0; + private int replicatedSameDirection = 0; + private int replicatedOppositeDirection = 0; + private int notReplicatedSameDirection = 0; + private int notReplicatedOppositeDirection = 0; + + public int getCovariateSignificant() { + return covariateSignificant; + } + + public int getReplicatedSameDirection() { + return replicatedSameDirection; 
+ } + + public int getReplicatedOppositeDirection() { + return replicatedOppositeDirection; + } + + public int getNotReplicatedSameDirection() { + return notReplicatedSameDirection; + } + + public int getNotReplicatedOppositeDirection() { + return notReplicatedOppositeDirection; + } + + public void incrementCovariateSignificant() { + covariateSignificant++; + } + + public void incrementReplicatedSameDirection() { + replicatedSameDirection++; + } + + public void incrementReplicatedOppositeDirection() { + replicatedOppositeDirection++; + } + + public void incrementNotReplicatedSameDirection() { + notReplicatedSameDirection++; + } + + public void incrementNotReplicatedOppositeDirection() { + notReplicatedOppositeDirection++; + } + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/Main.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/Main.java index dddf09892..0587461d3 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/Main.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/Main.java @@ -82,8 +82,8 @@ public static void main(String[] args) { try { MetaAnalyze m2 = new MetaAnalyze(); - m2.init(settings, texttoreplace, replacetextwith); - m2.analyze(); + m2.init(settings, texttoreplace, replacetextwith); + m2.analyze(); // System.gc(); // System.gc(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisCalculationThread.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisCalculationThread.java index dce25b347..154e7f17d 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisCalculationThread.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisCalculationThread.java @@ -4,8 +4,14 @@ */ package eqtlmappingpipeline.binarymeta.meta; -import 
umcg.genetica.io.trityper.probeannotation.ProbeTranslation; import eqtlmappingpipeline.binarymeta.meta.graphics.ZScorePlot; +import umcg.genetica.io.trityper.EQTL; +import umcg.genetica.io.trityper.bin.BinaryResultDataset; +import umcg.genetica.io.trityper.bin.BinaryResultSNP; +import umcg.genetica.io.trityper.probeannotation.ProbeTranslation; +import umcg.genetica.io.trityper.util.BaseAnnot; +import umcg.genetica.math.stats.Descriptives; + import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashSet; @@ -14,175 +20,167 @@ import java.util.logging.Logger; import java.util.zip.DataFormatException; import java.util.zip.Inflater; -import umcg.genetica.io.trityper.bin.BinaryResultSNP; -import umcg.genetica.math.stats.Descriptives; - -import umcg.genetica.io.trityper.EQTL; -import umcg.genetica.io.trityper.bin.BinaryResultDataset; -import umcg.genetica.io.trityper.util.BaseAnnot; /** - * * @author harmjan */ public class MetaAnalysisCalculationThread extends Thread { - protected LinkedBlockingQueue m_queue_input; - protected LinkedBlockingQueue m_queue_output; - protected ArrayList probes; - protected ArrayList snps; - protected ArrayList snpChr; - protected ArrayList snpChrPos; - protected BinaryResultDataset[] ds; - protected Integer[][] snpTranslation; - protected Integer[][] probeTranslationLookupTable; - protected ProbeTranslation probeTranslation; - protected MetaSettings m_settings; - protected ZScorePlot zs; - protected Inflater inflater = new Inflater(); - protected PValueThreshold pvaluethreshold; - private int numEffects = 0; - private int numSNPs = 0; - - public MetaAnalysisCalculationThread(LinkedBlockingQueue input, LinkedBlockingQueue output, - ArrayList snps, ArrayList probes, - ArrayList snpChr, ArrayList snpChrPos, - BinaryResultDataset[] ds, - Integer[][] snpTranslation, - Integer[][] probeTranslationLookupTable, ProbeTranslation probeTranslation, - MetaSettings m_settings, - ZScorePlot zs, PValueThreshold p) { - this.probes = 
probes; - this.snps = snps; - this.snpChr = snpChr; - this.snpChrPos = snpChrPos; - this.snpTranslation = snpTranslation; - this.ds = ds; - this.probeTranslation = probeTranslation; - this.probeTranslationLookupTable = probeTranslationLookupTable; - this.m_settings = m_settings; - this.zs = zs; - this.pvaluethreshold = p; - m_queue_input = input; - m_queue_output = output; - } - - @Override - public void run() { - boolean poison = false; - while (!poison) { - try { - MetaAnalysisWorkPackage pack = m_queue_input.take(); - poison = pack.getPoison(); - if (!poison) { - analyze(pack); + protected LinkedBlockingQueue m_queue_input; + protected LinkedBlockingQueue m_queue_output; + protected ArrayList probes; + protected ArrayList snps; + protected ArrayList snpChr; + protected ArrayList snpChrPos; + protected BinaryResultDataset[] ds; + protected Integer[][] snpTranslation; + protected Integer[][] probeTranslationLookupTable; + protected ProbeTranslation probeTranslation; + protected MetaSettings m_settings; + protected ZScorePlot zs; + protected Inflater inflater = new Inflater(); + protected PValueThreshold pvaluethreshold; + private int numEffects = 0; + private int numSNPs = 0; + + public MetaAnalysisCalculationThread(LinkedBlockingQueue input, LinkedBlockingQueue output, + ArrayList snps, ArrayList probes, + ArrayList snpChr, ArrayList snpChrPos, + BinaryResultDataset[] ds, + Integer[][] snpTranslation, + Integer[][] probeTranslationLookupTable, ProbeTranslation probeTranslation, + MetaSettings m_settings, + ZScorePlot zs, PValueThreshold p) { + this.probes = probes; + this.snps = snps; + this.snpChr = snpChr; + this.snpChrPos = snpChrPos; + this.snpTranslation = snpTranslation; + this.ds = ds; + this.probeTranslation = probeTranslation; + this.probeTranslationLookupTable = probeTranslationLookupTable; + this.m_settings = m_settings; + this.zs = zs; + this.pvaluethreshold = p; + m_queue_input = input; + m_queue_output = output; + } + + @Override + public void run() 
{ + boolean poison = false; + while (!poison) { + try { + MetaAnalysisWorkPackage pack = m_queue_input.take(); + poison = pack.getPoison(); + if (!poison) { + analyze(pack); // if(taken % printperiterations == 0){ // System.out.println("Thread "+this.getName()+" calculated "+taken+" workpackages."); // } - } + } - } catch (InterruptedException ex) { - ex.printStackTrace(); - } - } + } catch (InterruptedException ex) { + ex.printStackTrace(); + } + } - System.out.println(this.getName() + " - Poisoned - Num tests passed QC: " + numEffects + "\t" + numSNPs); - } + System.out.println(this.getName() + " - Poisoned - Num tests passed QC: " + numEffects + "\t" + numSNPs); + } - protected void analyze(MetaAnalysisWorkPackage pack) { + protected void analyze(MetaAnalysisWorkPackage pack) { - int s = pack.getSNPNum(); + int s = pack.getSNPNum(); - // DEBUG + // DEBUG // boolean verbose = false; // if (snps.get(s).equals("rs6919346")) { // verbose = true; // } - int[] totalNrSamples = new int[probes.size()]; - double[] zSum = new double[probes.size()]; - double[] zSumAbsolute = new double[probes.size()]; - int[] dsPassQC = new int[probes.size()]; - Result r = new Result(); - r.finalzscores = new Double[probes.size()]; - r.finalpvalues = new Double[probes.size()]; - r.numSamples = new Integer[probes.size()][ds.length]; - r.datasetZScores = new Double[probes.size()][ds.length]; - r.dspassingqc = new boolean[probes.size()][ds.length]; - r.snp = s; - r.passesQC = true; - r.datasets = new String[ds.length]; - boolean[] zscoreflipped = new boolean[ds.length]; - EQTL[] result = new EQTL[probes.size()]; - - BinaryResultSNP firstSNPPassingQC = null; - - Byte snpchr = snpChr.get(s); - Integer snpchrpos = snpChrPos.get(s); - boolean snphaspropermapping = true; - if (snpchr == null || snpchrpos == null || snpchr == -1) { - snpchr = -1; - snpchrpos = -1; - snphaspropermapping = false; - } - - StringBuilder zscoretableout = new StringBuilder(); - - int numDSPassingQC = 0; - - HashSet 
probesTestedHash = new HashSet(); - boolean[] testprobes = new boolean[probes.size()]; - for (int p = 0; p < probes.size(); p++) { - byte probechr = probeTranslation.getProbeChr(p); - int probechrpos = probeTranslation.getProbeChrPos(p); - boolean testprobe = false; - - if (m_settings.isCis() && m_settings.isTrans()) { - testprobe = true; - } else if (m_settings.isCis() && !m_settings.isTrans()) { - if (snpchr < 1 || probechr < 1) { - testprobe = false; - } else if (probechr == snpchr) { - if (Math.abs(snpchrpos - probechrpos) < m_settings.getCisdistance()) { - testprobe = true; - } else { - testprobe = false; - } - } else { - testprobe = false; - } - } else if (!m_settings.isCis() && m_settings.isTrans()) { - if (snpchr < 1 || probechr < 1) { - testprobe = false; - } else if (probechr == snpchr) { - if (Math.abs(snpchrpos - probechrpos) > m_settings.getTransdistance()) { - testprobe = true; - } else { - testprobe = false; - } - } else { - testprobe = true; - } - } - testprobes[p] = testprobe; - - if (testprobe) { - probesTestedHash.add(p); - } - } - - - - for (int d = 0; d < ds.length; d++) { - - Integer snpId = snpTranslation[d][s]; - - if (snpId != null) { - - BinaryResultSNP snpObject = pack.getSNPObject(d); // ds[d].getSnps()[snpId]; + int[] totalNrSamples = new int[probes.size()]; + double[] zSum = new double[probes.size()]; + double[] zSumAbsolute = new double[probes.size()]; + int[] dsPassQC = new int[probes.size()]; + Result r = new Result(); + r.finalzscores = new Double[probes.size()]; + r.finalpvalues = new Double[probes.size()]; + r.numSamples = new Integer[probes.size()][ds.length]; + r.datasetZScores = new Double[probes.size()][ds.length]; + r.dspassingqc = new boolean[probes.size()][ds.length]; + r.snp = s; + r.passesQC = true; + r.datasets = new String[ds.length]; + boolean[] zscoreflipped = new boolean[ds.length]; + EQTL[] result = new EQTL[probes.size()]; + + BinaryResultSNP firstSNPPassingQC = null; + + Byte snpchr = snpChr.get(s); + Integer 
snpchrpos = snpChrPos.get(s); + boolean snphaspropermapping = true; + if (snpchr == null || snpchrpos == null || snpchr == -1) { + snpchr = -1; + snpchrpos = -1; + snphaspropermapping = false; + } + + StringBuilder zscoretableout = new StringBuilder(); + + int numDSPassingQC = 0; + + HashSet probesTestedHash = new HashSet(); + boolean[] testprobes = new boolean[probes.size()]; + for (int p = 0; p < probes.size(); p++) { + byte probechr = probeTranslation.getProbeChr(p); + int probechrpos = probeTranslation.getProbeChrPos(p); + boolean testprobe = false; + + if (m_settings.isCis() && m_settings.isTrans()) { + testprobe = true; + } else if (m_settings.isCis() && !m_settings.isTrans()) { + if (snpchr < 1 || probechr < 1) { + testprobe = false; + } else if (probechr == snpchr) { + if (Math.abs(snpchrpos - probechrpos) < m_settings.getCisdistance()) { + testprobe = true; + } else { + testprobe = false; + } + } else { + testprobe = false; + } + } else if (!m_settings.isCis() && m_settings.isTrans()) { + if (snpchr < 1 || probechr < 1) { + testprobe = false; + } else if (probechr == snpchr) { + if (Math.abs(snpchrpos - probechrpos) > m_settings.getTransdistance()) { + testprobe = true; + } else { + testprobe = false; + } + } else { + testprobe = true; + } + } + testprobes[p] = testprobe; + + if (testprobe) { + probesTestedHash.add(p); + } + } + + + for (int d = 0; d < ds.length; d++) { + + Integer snpId = snpTranslation[d][s]; + + if (snpId != null) { + + BinaryResultSNP snpObject = pack.getSNPObject(d); // ds[d].getSnps()[snpId]; // long pointer = snpObject.getzScoreIndex(); // long nextpointer = -1; @@ -192,162 +190,159 @@ protected void analyze(MetaAnalysisWorkPackage pack) { // nextpointer = snpObject2.getzScoreIndex(); // } - byte[] data = pack.getData(d); - Float[] zscores = null; - if (data != null) { - try { - zscores = inflate(data, ds[d].getNumProbes()); // - pack.setData(d, null); - } catch (DataFormatException ex) { - 
Logger.getLogger(MetaAnalysisCalculationThread.class.getName()).log(Level.SEVERE, null, ex); - } + byte[] data = pack.getData(d); + Float[] zscores = null; + if (data != null) { + try { + zscores = inflate(data, ds[d].getNumProbes()); // + pack.setData(d, null); + } catch (DataFormatException ex) { + Logger.getLogger(MetaAnalysisCalculationThread.class.getName()).log(Level.SEVERE, null, ex); + } - if (zscores != null) { - numDSPassingQC++; - // weight for dataset d - int nrSamples = snpObject.getNumsamples(); - double weight = Descriptives.getSqrt(nrSamples); + if (zscores != null) { + numDSPassingQC++; + // weight for dataset d + int nrSamples = snpObject.getNumsamples(); + double weight = Descriptives.getSqrt(nrSamples); - for (int p = 0; p < probes.size(); p++) { + for (int p = 0; p < probes.size(); p++) { - boolean testprobe = testprobes[p]; - if (testprobe) { - Integer probeId = probeTranslationLookupTable[d][p]; + boolean testprobe = testprobes[p]; + if (testprobe) { + Integer probeId = probeTranslationLookupTable[d][p]; - if (!testprobe && probeId != null) { - zscores[probeId] = null; - } else if (probeId != null && testprobe) { - if (zscores[probeId] != null) { + if (!testprobe && probeId != null) { + zscores[probeId] = null; + } else if (probeId != null && testprobe) { + if (zscores[probeId] != null) { - totalNrSamples[p] += nrSamples; - r.dspassingqc[p][d] = true; - r.numSamples[p][d] = nrSamples; + totalNrSamples[p] += nrSamples; + r.dspassingqc[p][d] = true; + r.numSamples[p][d] = nrSamples; - double zscore = zscores[probeId]; + double zscore = zscores[probeId]; - r.datasets[d] = ds[d].getM_name(); - dsPassQC[p]++; + r.datasets[d] = ds[d].getM_name().intern(); + dsPassQC[p]++; - if (firstSNPPassingQC == null) { - firstSNPPassingQC = snpObject; - } else { - Boolean flipalleles = flipalleles(firstSNPPassingQC, snpObject); - if (flipalleles == null) { - System.err.println("ERROR! 
SNP alleles cannot be matched for snp\t" + snpObject.getName() + "\tin dataset\t" + d); - System.err.println("This SNP will be excluded from further research"); - r.passesQC = false; - } else if (flipalleles) { - zscore = -zscore; - zscoreflipped[d] = true; - } - } + if (firstSNPPassingQC == null) { + firstSNPPassingQC = snpObject; + } else { + Boolean flipalleles = flipalleles(firstSNPPassingQC, snpObject); + if (flipalleles == null) { + System.err.println("ERROR! SNP alleles cannot be matched for snp\t" + snpObject.getName() + "\tin dataset\t" + d); + System.err.println("This SNP will be excluded from further research"); + r.passesQC = false; + } else if (flipalleles) { + zscore = -zscore; + zscoreflipped[d] = true; + } + } - r.datasetZScores[p][d] = new Double(zscore); + r.datasetZScores[p][d] = new Double(zscore); // if (verbose) { // System.out.println(d + "\t" + r.datasetZScores[p][d]); // } - zSumAbsolute[p] += Math.abs(zscore * weight); - zSum[p] += (zscore * weight); - } else { - } - } - } - } - for (int i = 0; i < zscores.length; i++) { - zscores[i] = null; - } - } - } - - - - - } - } + zSumAbsolute[p] += Math.abs(zscore * weight); + zSum[p] += (zscore * weight); + } else { + } + } + } + } + for (int i = 0; i < zscores.length; i++) { + zscores[i] = null; + } + } + } + + + } + } // if (verbose) { //// System.exit(0); // } - pack.clearByteData(); - - int numDSThatMinimallyShouldHaveEffect = m_settings.getSnpDatasetPresenceThreshold(); - if (numDSThatMinimallyShouldHaveEffect == 0) { - numDSThatMinimallyShouldHaveEffect = 1; - } - - if (numDSPassingQC >= numDSThatMinimallyShouldHaveEffect) { - pack.setPassedQC(true); - Double[] metaZPerProbe = null; - if (m_settings.isMakezscoretable()) { - metaZPerProbe = new Double[probes.size()]; - } - int probesTested = 0; - numSNPs++; - for (int p = 0; p < probes.size(); p++) { - - if (dsPassQC[p] >= numDSThatMinimallyShouldHaveEffect && totalNrSamples[p] > 0) { - numEffects++; - probesTestedHash.add(p); - 
probesTested++; - double zSumVal = zSum[p]; - double sqrtSample = Descriptives.getSqrt(totalNrSamples[p]); - double metaZScore = zSumVal / sqrtSample; - double pValueOverall = Descriptives.convertZscoreToPvalue(metaZScore); - - double zSumValAbsolute = zSumAbsolute[p]; - double zScoreAbs = zSumValAbsolute / sqrtSample; - double pValueOverallAbs = Descriptives.convertZscoreToPvalue(zScoreAbs); - - - - boolean outputeqtl = false; - if (m_settings.isMakezscoretable()) { - outputeqtl = true; - } else if (pValueOverall <= pvaluethreshold.getPvalue()) { - outputeqtl = true; - } - - if (outputeqtl) { - result[p] = new EQTL(); - EQTL e = result[p]; - e.setRsChr(snpChr.get(s)); - e.setRsChrPos(snpChrPos.get(s)); - e.setProbeChr(probeTranslation.getProbeChr(p)); - e.setProbeChrPos(probeTranslation.getProbeChrPos(p)); - e.setDatasets(r.datasets); - e.setAlleleAssessed(BaseAnnot.toString(firstSNPPassingQC.getAssessedAllele())); - byte[] alleles = firstSNPPassingQC.getAlleles(); - String alleleStr = BaseAnnot.toString(alleles[0]) + "/" + BaseAnnot.toString(alleles[1]); - e.setAlleles(alleleStr); - e.setDatasetZScores(r.datasetZScores[p]); - e.setZscore(metaZScore); - e.setPvalue(pValueOverall); - e.setZscoreAbs(zScoreAbs); - e.setPvalueAbs(pValueOverallAbs); - - if (m_settings.isUseAbsoluteZscore()) { - e.setUseAbsoluteZScore(); - } - - if (pValueOverallAbs < 1) { - for (int d1 = 0; d1 < ds.length; d1++) { - boolean ds1PassesQC = r.dspassingqc[p][d1]; - if (ds1PassesQC) { - double datasetZScore = r.datasetZScores[p][d1]; - if (zscoreflipped[d1]) { - datasetZScore = -datasetZScore; - } - for (int d2 = d1 + 1; d2 < ds.length; d2++) { - if (r.dspassingqc[p][d2]) { - double zscore2 = r.datasetZScores[p][d2]; - if (zscoreflipped[d2]) { - zscore2 = -zscore2; - } - if (zs != null) { + pack.clearByteData(); + + int numDSThatMinimallyShouldHaveEffect = m_settings.getSnpDatasetPresenceThreshold(); + if (numDSThatMinimallyShouldHaveEffect == 0) { + numDSThatMinimallyShouldHaveEffect = 1; 
+ } + + if (numDSPassingQC >= numDSThatMinimallyShouldHaveEffect) { + pack.setPassedQC(true); + Double[] metaZPerProbe = null; + if (m_settings.isMakezscoretable()) { + metaZPerProbe = new Double[probes.size()]; + } + int probesTested = 0; + numSNPs++; + for (int p = 0; p < probes.size(); p++) { + + if (dsPassQC[p] >= numDSThatMinimallyShouldHaveEffect && totalNrSamples[p] > 0) { + numEffects++; + probesTestedHash.add(p); + probesTested++; + double zSumVal = zSum[p]; + double sqrtSample = Descriptives.getSqrt(totalNrSamples[p]); + double metaZScore = zSumVal / sqrtSample; + double pValueOverall = Descriptives.convertZscoreToPvalue(metaZScore); + + double zSumValAbsolute = zSumAbsolute[p]; + double zScoreAbs = zSumValAbsolute / sqrtSample; + double pValueOverallAbs = Descriptives.convertZscoreToPvalue(zScoreAbs); + + + boolean outputeqtl = false; + if (m_settings.isMakezscoretable()) { + outputeqtl = true; + } else if (pValueOverall <= pvaluethreshold.getPvalue()) { + outputeqtl = true; + } + + if (outputeqtl) { + result[p] = new EQTL(); + EQTL e = result[p]; + e.setRsChr(snpChr.get(s)); + e.setRsChrPos(snpChrPos.get(s)); + e.setProbeChr(probeTranslation.getProbeChr(p)); + e.setProbeChrPos(probeTranslation.getProbeChrPos(p)); + e.setDatasets(r.datasets); + e.setAlleleAssessed(BaseAnnot.toString(firstSNPPassingQC.getAssessedAllele()).intern()); + byte[] alleles = firstSNPPassingQC.getAlleles(); + String alleleStr = (BaseAnnot.toString(alleles[0]) + "/" + BaseAnnot.toString(alleles[1])).intern(); + e.setAlleles(alleleStr); + e.setDatasetZScores(r.datasetZScores[p]); + e.setZscore(metaZScore); + e.setPvalue(pValueOverall); + e.setZscoreAbs(zScoreAbs); + e.setPvalueAbs(pValueOverallAbs); + + if (m_settings.isUseAbsoluteZscore()) { + e.setUseAbsoluteZScore(); + } + + if (pValueOverallAbs < 1) { + for (int d1 = 0; d1 < ds.length; d1++) { + boolean ds1PassesQC = r.dspassingqc[p][d1]; + if (ds1PassesQC) { + double datasetZScore = r.datasetZScores[p][d1]; + if 
(zscoreflipped[d1]) { + datasetZScore = -datasetZScore; + } + for (int d2 = d1 + 1; d2 < ds.length; d2++) { + if (r.dspassingqc[p][d2]) { + double zscore2 = r.datasetZScores[p][d2]; + if (zscoreflipped[d2]) { + zscore2 = -zscore2; + } + if (zs != null) { // if ((datasetZScore < -10 && zscore2 > 10) || (datasetZScore > 10 && zscore2 < -10)) { // System.out.println(""); // System.out.println("Opposite effect: "); @@ -393,163 +388,162 @@ protected void analyze(MetaAnalysisWorkPackage pack) { // // System.out.println(""); // } - if (pValueOverall < 1E-15) { - zs.draw(new Double(datasetZScore), new Double(zscore2), d1, d2); - } - } - } - } - if (zs != null && pValueOverall < 1E-15) { - zs.draw(new Double(datasetZScore), new Double(metaZScore), d1, ds.length); - } - } - } - } - // - e.setDatasetsSamples(r.numSamples[p]); - e.setProbe(probes.get(p)); - e.setRsName(firstSNPPassingQC.getName()); - e.setProbeHUGO(probeTranslation.getProbeSymbol(p)); - - } - - if (m_settings.isMakezscoretable()) { - metaZPerProbe[p] = metaZScore; - } - } else { - r.finalzscores[p] = null; - } - } - - if (m_settings.isMakezscoretable()) { - - if (firstSNPPassingQC != null) { - zscoretableout.append(snps.get(s)); - zscoretableout.append("\t").append(BaseAnnot.toString(firstSNPPassingQC.getAlleles()[0])).append("/").append(BaseAnnot.toString(firstSNPPassingQC.getAlleles()[1])).append("\t").append(BaseAnnot.toString(firstSNPPassingQC.getAssessedAllele())); - - for (int i = 0; i < metaZPerProbe.length; i++) { - zscoretableout.append("\t").append(metaZPerProbe[i]); - metaZPerProbe[i] = null; - } - metaZPerProbe = null; - pack.setZScoreOut(zscoretableout.toString()); - } - } - r.clearData(); - - - if (numDSPassingQC > 0) { - pack.setProbesTestedHash(probesTestedHash); - } else { - pack.setProbesTestedHash(new HashSet()); - } - pack.setNumOfTestedProbes(probesTested); - pack.setResult(result); - try { - m_queue_output.put(pack); - } catch (InterruptedException ex) { - ex.printStackTrace(); - } - - } 
- - - - } - - // TODO: AT / GC SNPs?? - public Boolean flipalleles(BinaryResultSNP firstSNPPassingQC, BinaryResultSNP snpObject) { - byte[] allelesfirst = firstSNPPassingQC.getAlleles(); - byte allelefirstassessed = firstSNPPassingQC.getAssessedAllele(); - - byte[] allelessecond = snpObject.getAlleles(); - byte allelesecondassessed = snpObject.getAssessedAllele(); - - int nridenticalalleles = 0; - - for (int i = 0; i < allelesfirst.length; i++) { - byte allele1 = allelesfirst[i]; - for (int j = 0; j < allelessecond.length; j++) { - if (allelessecond[j] == allele1) { - nridenticalalleles++; - } - } - } - - if (nridenticalalleles == 2) { - // alleles are identical. check if same allele was assessed... - if (allelefirstassessed == allelesecondassessed) { - return false; - } else { - return true; - } - } else { - // try complement - allelessecond = convertToComplementaryAlleles(allelessecond); - allelesecondassessed = BaseAnnot.getComplement(allelesecondassessed); - nridenticalalleles = 0; - - for (int i = 0; i < allelesfirst.length; i++) { - byte allele1 = allelesfirst[i]; - for (int j = 0; j < allelessecond.length; j++) { - if (allelessecond[j] == allele1) { - nridenticalalleles++; - } - } - } - - if (nridenticalalleles == 2) { - // alleles are identical. check if same allele was assessed... 
- if (allelefirstassessed == allelesecondassessed) { - return false; - } else { - return true; - } - } - } - return null; - } - - public byte[] convertToComplementaryAlleles(byte[] allelesToCompare) { - byte[] allelesComplementary = new byte[2]; - for (int a = 0; a < 2; a++) { - allelesComplementary[a] = BaseAnnot.getComplement(allelesToCompare[a]); - } - return allelesComplementary; - } - - protected Float[] inflate(byte[] buffer, int numElems) throws DataFormatException { - inflater.setInput(buffer); - inflater.finished(); - byte[] decompressed = new byte[numElems * 4]; - inflater.inflate(decompressed); - - long actuallydecompressed = inflater.getBytesWritten(); - if (actuallydecompressed != numElems * 4) { - throw new DataFormatException("IO Error: uncompressed data does not correspond to the size requested\t" + actuallydecompressed + "\t" + numElems * 4); - } - - inflater.reset(); - - ByteBuffer bytebuffer = ByteBuffer.wrap(decompressed); - Float[] output = new Float[numElems]; - int ctr = 0; - for (int i = 0; i < numElems; i++) { - Float f = bytebuffer.getFloat(); - if (f.isNaN()) { - f = null; - } else { - ctr++; - } - output[i] = f; - } - - decompressed = null; - - if (ctr == 0) { - return null; - } else { - return output; - } - } + if (pValueOverall < 1E-15) { + zs.draw(new Double(datasetZScore), new Double(zscore2), d1, d2); + } + } + } + } + if (zs != null && pValueOverall < 1E-15) { + zs.draw(new Double(datasetZScore), new Double(metaZScore), d1, ds.length); + } + } + } + } + // + e.setDatasetsSamples(r.numSamples[p]); + e.setProbe(probes.get(p).intern()); + e.setRsName(firstSNPPassingQC.getName().intern()); + e.setProbeHUGO(probeTranslation.getProbeSymbol(p).intern()); + + } + + if (m_settings.isMakezscoretable()) { + metaZPerProbe[p] = metaZScore; + } + } else { + r.finalzscores[p] = null; + } + } + + if (m_settings.isMakezscoretable()) { + + if (firstSNPPassingQC != null) { + zscoretableout.append(snps.get(s)); + 
zscoretableout.append("\t").append(BaseAnnot.toString(firstSNPPassingQC.getAlleles()[0])).append("/").append(BaseAnnot.toString(firstSNPPassingQC.getAlleles()[1])).append("\t").append(BaseAnnot.toString(firstSNPPassingQC.getAssessedAllele())); + + for (int i = 0; i < metaZPerProbe.length; i++) { + zscoretableout.append("\t").append(metaZPerProbe[i]); + metaZPerProbe[i] = null; + } + metaZPerProbe = null; + pack.setZScoreOut(zscoretableout.toString()); + } + } + r.clearData(); + + + if (numDSPassingQC > 0) { + pack.setProbesTestedHash(probesTestedHash); + } else { + pack.setProbesTestedHash(new HashSet()); + } + pack.setNumOfTestedProbes(probesTested); + pack.setResult(result); + try { + m_queue_output.put(pack); + } catch (InterruptedException ex) { + ex.printStackTrace(); + } + + } + + + } + + // TODO: AT / GC SNPs?? + public Boolean flipalleles(BinaryResultSNP firstSNPPassingQC, BinaryResultSNP snpObject) { + byte[] allelesfirst = firstSNPPassingQC.getAlleles(); + byte allelefirstassessed = firstSNPPassingQC.getAssessedAllele(); + + byte[] allelessecond = snpObject.getAlleles(); + byte allelesecondassessed = snpObject.getAssessedAllele(); + + int nridenticalalleles = 0; + + for (int i = 0; i < allelesfirst.length; i++) { + byte allele1 = allelesfirst[i]; + for (int j = 0; j < allelessecond.length; j++) { + if (allelessecond[j] == allele1) { + nridenticalalleles++; + } + } + } + + if (nridenticalalleles == 2) { + // alleles are identical. check if same allele was assessed... 
+ if (allelefirstassessed == allelesecondassessed) { + return false; + } else { + return true; + } + } else { + // try complement + allelessecond = convertToComplementaryAlleles(allelessecond); + allelesecondassessed = BaseAnnot.getComplement(allelesecondassessed); + nridenticalalleles = 0; + + for (int i = 0; i < allelesfirst.length; i++) { + byte allele1 = allelesfirst[i]; + for (int j = 0; j < allelessecond.length; j++) { + if (allelessecond[j] == allele1) { + nridenticalalleles++; + } + } + } + + if (nridenticalalleles == 2) { + // alleles are identical. check if same allele was assessed... + if (allelefirstassessed == allelesecondassessed) { + return false; + } else { + return true; + } + } + } + return null; + } + + public byte[] convertToComplementaryAlleles(byte[] allelesToCompare) { + byte[] allelesComplementary = new byte[2]; + for (int a = 0; a < 2; a++) { + allelesComplementary[a] = BaseAnnot.getComplement(allelesToCompare[a]); + } + return allelesComplementary; + } + + protected Float[] inflate(byte[] buffer, int numElems) throws DataFormatException { + inflater.setInput(buffer); + inflater.finished(); + byte[] decompressed = new byte[numElems * 4]; + inflater.inflate(decompressed); + + long actuallydecompressed = inflater.getBytesWritten(); + if (actuallydecompressed != numElems * 4) { + throw new DataFormatException("IO Error: uncompressed data does not correspond to the size requested\t" + actuallydecompressed + "\t" + numElems * 4); + } + + inflater.reset(); + + ByteBuffer bytebuffer = ByteBuffer.wrap(decompressed); + Float[] output = new Float[numElems]; + int ctr = 0; + for (int i = 0; i < numElems; i++) { + Float f = bytebuffer.getFloat(); + if (f.isNaN()) { + f = null; + } else { + ctr++; + } + output[i] = f; + } + + decompressed = null; + + if (ctr == 0) { + return null; + } else { + return output; + } + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisResultThread.java 
b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisResultThread.java index 0c1ccc66d..5bde34cc7 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisResultThread.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalysisResultThread.java @@ -4,197 +4,201 @@ */ package eqtlmappingpipeline.binarymeta.meta; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.io.trityper.EQTL; + import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.concurrent.LinkedBlockingQueue; -import umcg.genetica.io.text.TextFile; -import umcg.genetica.io.trityper.EQTL; /** - * * @author harmjan */ public class MetaAnalysisResultThread extends Thread { - private final LinkedBlockingQueue m_queue_input; + private final LinkedBlockingQueue m_queue_input; // private double pvaluethreshold = 1; - - - private static String header = "PValue\t" - + "SNPName\t" - + "SNPChr\t" - + "SNPChrPos\t" - + "ProbeName\t" - + "ProbeChr\t" - + "ProbeCenterChrPos\t" - + "CisTrans\t" - + "SNPType\t" - + "AlleleAssessed\t" - + "OverallZScore\t" - + "DatasetsWhereSNPProbePairIsAvailableAndPassesQC\t" - + "DatasetsZScores\t" - + "DatasetsNrSamples\t" - + "IncludedDatasetsMeanProbeExpression\t" - + "IncludedDatasetsProbeExpressionVariance\t" - + "HGNCName\t" - + "IncludedDatasetsCorrelationCoefficient"; - - private int ctr = 0; - private EQTL[] eQTLBuffer = new EQTL[100000]; - private EQTL[] finalEQTLBuffer = new EQTL[0]; - private int nrInFinalBuffer = 0; - private static MetaSettings m_settings; - private int perm; - private String[] datasets; - private final TextFile zscoretable; - private final PValueThreshold pvaluethreshold; - private final ArrayList snps; - private final HashMap> snpProbeSelection; - private final ArrayList probes; - - public MetaAnalysisResultThread(LinkedBlockingQueue input, - 
MetaSettings m_settings, - String[] datasets, - int perm, - TextFile zscoretable, PValueThreshold p, ArrayList snps, HashMap> snpProbeSelection, ArrayList probes) { - this.m_settings = m_settings; - this.datasets = datasets; - this.perm = perm; - this.zscoretable = zscoretable; - this.pvaluethreshold = p; - m_queue_input = input; - this.snps = snps; - this.snpProbeSelection = snpProbeSelection; - this.probes = probes; - } - TextFile snpout = null; - - @Override - public void run() { - boolean poison = false; - try { - snpout = new TextFile(m_settings.getOutput() + "snpsandnreqtls.txt", TextFile.W); - while (!poison) { - try { - MetaAnalysisWorkPackage pack = m_queue_input.take(); - if (!pack.getPoison()) { - Integer snpnum = pack.getSNPNum(); - String snp = snps.get(snpnum); - if (snpProbeSelection == null || snpProbeSelection.containsKey(snp)) { - analyze(pack); - } + + + private static String header = "PValue\t" + + "SNPName\t" + + "SNPChr\t" + + "SNPChrPos\t" + + "ProbeName\t" + + "ProbeChr\t" + + "ProbeCenterChrPos\t" + + "CisTrans\t" + + "SNPType\t" + + "AlleleAssessed\t" + + "OverallZScore\t" + + "DatasetsWhereSNPProbePairIsAvailableAndPassesQC\t" + + "DatasetsZScores\t" + + "DatasetsNrSamples\t" + + "IncludedDatasetsMeanProbeExpression\t" + + "IncludedDatasetsProbeExpressionVariance\t" + + "HGNCName\t" + + "IncludedDatasetsCorrelationCoefficient"; + + private int ctr = 0; + private EQTL[] eQTLBuffer = new EQTL[100000]; + private EQTL[] finalEQTLBuffer = new EQTL[0]; + private int nrInFinalBuffer = 0; + private static MetaSettings m_settings; + private int perm; + private String[] datasets; + private final TextFile zscoretable; + private final PValueThreshold pvaluethreshold; + private final ArrayList snps; + private final HashMap> snpProbeSelection; + private final ArrayList probes; + + public MetaAnalysisResultThread(LinkedBlockingQueue input, + MetaSettings m_settings, + String[] datasets, + int perm, + TextFile zscoretable, PValueThreshold p, ArrayList 
snps, HashMap> snpProbeSelection, ArrayList probes) { + this.m_settings = m_settings; + this.datasets = datasets; + this.perm = perm; + this.zscoretable = zscoretable; + this.pvaluethreshold = p; + m_queue_input = input; + this.snps = snps; + this.snpProbeSelection = snpProbeSelection; + this.probes = probes; + } + + TextFile snpout = null; + + @Override + public void run() { + boolean poison = false; + try { + snpout = new TextFile(m_settings.getOutput() + "snpsandnreqtls.txt", TextFile.W); + while (!poison) { + try { + MetaAnalysisWorkPackage pack = m_queue_input.take(); + if (!pack.getPoison()) { + Integer snpnum = pack.getSNPNum(); + String snp = snps.get(snpnum); + if (snpProbeSelection == null || snpProbeSelection.containsKey(snp)) { + analyze(pack); + } // if(taken % printperiterations == 0){ // System.out.println("Thread "+this.getName()+" calculated "+taken+" workpackages."); // } - } else { - poison = pack.getPoison(); + } else { + poison = pack.getPoison(); // System.out.println("Thread " + m_name + " got killed by a poisonous workpackage, but was bravely able to perform\t" + testsPerformed + "\ttests"); - } - - } catch (InterruptedException ex) { - ex.printStackTrace(); - } - } - - if (ctr > 0) { - mergebuffers(ctr); - } - snpout.close(); - - // write eQTL results.. 
- - writeresults(); - - TextFile out = new TextFile(m_settings.getOutput() + "/NumberOfEQTLSTotal.txt", TextFile.W); - out.writeln("Number of eQTLs in total: " + totalNumberOfEQTLs); - System.out.println("Number of eQTLs in total: " + totalNumberOfEQTLs); - out.writeln("Number of snps in total: " + uniqueSNPs.size()); - out.writeln("Number of snps in total not passing QC: " + uniqueSNPsNotPassingQC.size()); - - - System.out.println("Number of snps in total: " + uniqueSNPs.size()); - TextFile out2 = new TextFile(m_settings.getOutput() + "/TestedSNPs.txt", TextFile.W); - List list = new ArrayList(uniqueSNPs); - out2.writeList(list); - out2.close(); - - out2 = new TextFile(m_settings.getOutput() + "/TestedSNPsNPQC.txt", TextFile.W); - list = new ArrayList(uniqueSNPsNotPassingQC); - out2.writeList(list); - - out.writeln("Number of probes in total: " + uniqueProbes.size()); - System.out.println("Number of probes in total: " + uniqueProbes.size()); - out2 = new TextFile(m_settings.getOutput() + "/TestedProbes.txt", TextFile.W); - List list2 = new ArrayList(uniqueProbes); - ArrayList list2str = new ArrayList(); - for (Integer i : list2) { - list2str.add("" + i); - } - - out2.writeList(list2str); - out.close(); - - } catch (IOException e) { - e.printStackTrace(); - } - } - private HashSet uniqueSNPs = new HashSet(); - private HashSet uniqueSNPsNotPassingQC = new HashSet(); - private HashSet uniqueProbes = new HashSet(); - private int totalNumberOfEQTLs = 0; - - private void analyze(MetaAnalysisWorkPackage pack) { - - Integer snpnum = pack.getSNPNum(); - String snp = snps.get(snpnum); - - HashSet allowedProbes = null; - if (snpProbeSelection != null) { - allowedProbes = snpProbeSelection.get(snp); - } - - Integer[] probeList = pack.getListOfTestedProbes(); - for (int i = 0; i < probeList.length; i++) { - String probe = probes.get(probeList[i]); - if (allowedProbes == null || allowedProbes.contains(probe)) { - totalNumberOfEQTLs++; - uniqueProbes.add(probeList[i]); - } - } - 
- if (pack.getPassedQC()) { - uniqueSNPs.add(snps.get(snpnum)); - } else { - uniqueSNPsNotPassingQC.add(snps.get(snpnum)); - } - if (m_settings.isMakezscoretable() && zscoretable != null) { - try { - String zscoreout = pack.getZScoreOut(); - if (zscoreout != null) { - zscoretable.writeln(zscoreout); - pack.setZScoreOut(null); - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - EQTL[] finalEQTLs = pack.getResult(); - - int nreQTLsForSNP = 0; - for (int p = 0; p < finalEQTLs.length; p++) { + } + + } catch (InterruptedException ex) { + ex.printStackTrace(); + } + } + + if (ctr > 0) { + mergebuffers(ctr); + } + snpout.close(); + + java.util.Arrays.sort(finalEQTLBuffer); + + // write eQTL results.. + + writeresults(); + + TextFile out = new TextFile(m_settings.getOutput() + "/NumberOfEQTLSTotal.txt", TextFile.W); + out.writeln("Number of eQTLs in total: " + totalNumberOfEQTLs); + System.out.println("Number of eQTLs in total: " + totalNumberOfEQTLs); + out.writeln("Number of snps in total: " + uniqueSNPs.size()); + out.writeln("Number of snps in total not passing QC: " + uniqueSNPsNotPassingQC.size()); + + + System.out.println("Number of snps in total: " + uniqueSNPs.size()); + TextFile out2 = new TextFile(m_settings.getOutput() + "/TestedSNPs.txt", TextFile.W); + List list = new ArrayList(uniqueSNPs); + out2.writeList(list); + out2.close(); + + out2 = new TextFile(m_settings.getOutput() + "/TestedSNPsNPQC.txt", TextFile.W); + list = new ArrayList(uniqueSNPsNotPassingQC); + out2.writeList(list); + + out.writeln("Number of probes in total: " + uniqueProbes.size()); + System.out.println("Number of probes in total: " + uniqueProbes.size()); + out2 = new TextFile(m_settings.getOutput() + "/TestedProbes.txt", TextFile.W); + List list2 = new ArrayList(uniqueProbes); + ArrayList list2str = new ArrayList(); + for (Integer i : list2) { + list2str.add("" + i); + } + + out2.writeList(list2str); + out.close(); + + } catch (IOException e) { + e.printStackTrace(); + } + 
} + + private HashSet uniqueSNPs = new HashSet(); + private HashSet uniqueSNPsNotPassingQC = new HashSet(); + private HashSet uniqueProbes = new HashSet(); + private int totalNumberOfEQTLs = 0; + + private void analyze(MetaAnalysisWorkPackage pack) { + + Integer snpnum = pack.getSNPNum(); + String snp = snps.get(snpnum).intern(); + + HashSet allowedProbes = null; + if (snpProbeSelection != null) { + allowedProbes = snpProbeSelection.get(snp); + } + + Integer[] probeList = pack.getListOfTestedProbes(); + for (int i = 0; i < probeList.length; i++) { + String probe = probes.get(probeList[i]); + if (probe != null && (allowedProbes == null || allowedProbes.contains(probe))) { + totalNumberOfEQTLs++; + uniqueProbes.add(probeList[i]); + } + } + + if (pack.getPassedQC()) { + uniqueSNPs.add(snps.get(snpnum).intern()); + } else { + uniqueSNPsNotPassingQC.add(snps.get(snpnum).intern()); + } + if (m_settings.isMakezscoretable() && zscoretable != null) { + try { + String zscoreout = pack.getZScoreOut(); + if (zscoreout != null) { + zscoretable.writeln(zscoreout); + pack.setZScoreOut(null); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + EQTL[] finalEQTLs = pack.getResult(); + + int nreQTLsForSNP = 0; + for (int p = 0; p < finalEQTLs.length; p++) { // if (finalEQTLs[p] != null) { //// uniqueProbes.add(finalEQTLs[p].getProbe()); // } - if (finalEQTLs[p] != null && finalEQTLs[p].getPvalue() <= pvaluethreshold.getPvalue() && (allowedProbes == null || allowedProbes.contains(finalEQTLs[p].getProbe()))) { - nreQTLsForSNP++; - // check cis / trans constraints ... + if (finalEQTLs[p] != null && finalEQTLs[p].getPvalue() <= pvaluethreshold.getPvalue() && (allowedProbes == null || allowedProbes.contains(finalEQTLs[p].getProbe()))) { + nreQTLsForSNP++; + // check cis / trans constraints ... 
// if(finalEQTLs[p].getProbeChr()) // boolean includeEQTL = true; // if(transAnalysis && !cisAnalysis){ @@ -212,102 +216,101 @@ private void analyze(MetaAnalysisWorkPackage pack) { // } // if (includeEQTL) { - eQTLBuffer[ctr] = finalEQTLs[p]; - ctr++; - if (ctr == eQTLBuffer.length) { - mergebuffers(ctr); - ctr = 0; + eQTLBuffer[ctr] = finalEQTLs[p]; + ctr++; + if (ctr == eQTLBuffer.length) { + mergebuffers(ctr); + ctr = 0; // System.out.println("SNPs tested: "+s+"/"+snps.size()+", threshold: "+pvaluethreshold); - } + } // } - } else { - if (finalEQTLs[p] != null) { - finalEQTLs[p].clearData(); - finalEQTLs[p] = null; - } - } - } - finalEQTLs = null; - try { - snpout.writeln(snps.get(pack.getSNPNum()) + "\t" + nreQTLsForSNP); - } catch (Exception e) { - e.printStackTrace(); - } - pack.clearData(); - pack = null; - } - - protected void mergebuffers(int ctr) { - EQTL[] toMerge = null; - if (ctr < eQTLBuffer.length) { - toMerge = new EQTL[ctr]; - System.arraycopy(eQTLBuffer, 0, toMerge, 0, ctr); - } else { - toMerge = eQTLBuffer; - } - - EQTL[] tmp = new EQTL[finalEQTLBuffer.length + toMerge.length]; - System.arraycopy(toMerge, 0, tmp, 0, toMerge.length); - System.arraycopy(finalEQTLBuffer, 0, tmp, toMerge.length, finalEQTLBuffer.length); - - java.util.Arrays.sort(tmp); - - nrInFinalBuffer += toMerge.length; - if (nrInFinalBuffer < m_settings.getFinalEQTLBufferMaxLength()) { - finalEQTLBuffer = tmp; - } else { - - finalEQTLBuffer = new EQTL[m_settings.getFinalEQTLBufferMaxLength()]; + } else { + if (finalEQTLs[p] != null) { + finalEQTLs[p].clearData(); + finalEQTLs[p] = null; + } + } + } + finalEQTLs = null; + try { + snpout.writeln(snps.get(pack.getSNPNum()) + "\t" + nreQTLsForSNP); + } catch (Exception e) { + e.printStackTrace(); + } + pack.clearData(); + pack = null; + } + + protected void mergebuffers(int ctr) { + EQTL[] toMerge = null; + if (ctr < eQTLBuffer.length) { + toMerge = new EQTL[ctr]; + System.arraycopy(eQTLBuffer, 0, toMerge, 0, ctr); + } else { + 
toMerge = eQTLBuffer; + } + + EQTL[] tmp = new EQTL[finalEQTLBuffer.length + toMerge.length]; + System.arraycopy(toMerge, 0, tmp, 0, toMerge.length); + System.arraycopy(finalEQTLBuffer, 0, tmp, toMerge.length, finalEQTLBuffer.length); + + + nrInFinalBuffer += toMerge.length; + if (nrInFinalBuffer < m_settings.getFinalEQTLBufferMaxLength()) { + finalEQTLBuffer = tmp; + } else { + + java.util.Arrays.sort(tmp); + finalEQTLBuffer = new EQTL[m_settings.getFinalEQTLBufferMaxLength()]; // System.out.println(finalEQTLBuffer.length+"\t"+tmp.length); - System.arraycopy(tmp, 0, finalEQTLBuffer, 0, m_settings.getFinalEQTLBufferMaxLength()); - nrInFinalBuffer = m_settings.getFinalEQTLBufferMaxLength(); - pvaluethreshold.setPvalue(finalEQTLBuffer[nrInFinalBuffer - 1].getPvalue()); - - } - } + System.arraycopy(tmp, 0, finalEQTLBuffer, 0, m_settings.getFinalEQTLBufferMaxLength()); + nrInFinalBuffer = m_settings.getFinalEQTLBufferMaxLength(); + pvaluethreshold.setPvalue(finalEQTLBuffer[nrInFinalBuffer - 1].getPvalue()); - private void writeresults() throws IOException { + } + } + private void writeresults() throws IOException { - TextFile out = null; - if (perm > 0) { - out = new TextFile(m_settings.getOutput() + "PermutedEQTLsPermutationRound" + perm + ".txt.gz", TextFile.W); - } else { - out = new TextFile(m_settings.getOutput() + "eQTLs.txt", TextFile.W); - } + TextFile out = null; + if (perm > 0) { + out = new TextFile(m_settings.getOutput() + "PermutedEQTLsPermutationRound" + perm + ".txt.gz", TextFile.W); + } else { + out = new TextFile(m_settings.getOutput() + "eQTLs.txt.gz", TextFile.W); + } - out.write(header + "\n"); + out.write(header + "\n"); - for (int i = 0; i < finalEQTLBuffer.length; i++) { - finalEQTLBuffer[i].setDatasets(datasets); - out.writeln(finalEQTLBuffer[i].toString()); - } + for (int i = 0; i < finalEQTLBuffer.length; i++) { + finalEQTLBuffer[i].setDatasets(datasets); + out.writeln(finalEQTLBuffer[i].toString()); + } - out.close(); + out.close(); - 
TextFile oppositeEffects = null; - if (perm > 0) { - oppositeEffects = new TextFile(m_settings.getOutput() + "OppositeEffects-PermutedEQTLsPermutationRound" + perm + ".txt.gz", TextFile.W); - } else { - oppositeEffects = new TextFile(m_settings.getOutput() + "OppositeEffects-eQTLs.txt", TextFile.W); - } + TextFile oppositeEffects = null; + if (perm > 0) { + oppositeEffects = new TextFile(m_settings.getOutput() + "OppositeEffects-PermutedEQTLsPermutationRound" + perm + ".txt.gz", TextFile.W); + } else { + oppositeEffects = new TextFile(m_settings.getOutput() + "OppositeEffects-eQTLs.txt", TextFile.W); + } - for (int i = 0; i < finalEQTLBuffer.length; i++) { - String oppositeEffectIndicator = ""; - double pValueOverall = finalEQTLBuffer[i].getPvalue(); - double pValueAbs = finalEQTLBuffer[i].getPvalueAbs(); - if (pValueAbs < pValueOverall) { - oppositeEffectIndicator = "OppositeEffect"; - if (pValueAbs <= pValueOverall / 100000) { - oppositeEffectIndicator = "StrongOppositeEffect"; - } + for (int i = 0; i < finalEQTLBuffer.length; i++) { + String oppositeEffectIndicator = ""; + double pValueOverall = finalEQTLBuffer[i].getPvalue(); + double pValueAbs = finalEQTLBuffer[i].getPvalueAbs(); + if (pValueAbs < pValueOverall) { + oppositeEffectIndicator = "OppositeEffect"; + if (pValueAbs <= pValueOverall / 100000) { + oppositeEffectIndicator = "StrongOppositeEffect"; + } - oppositeEffects.writeln(oppositeEffectIndicator + "\t" + pValueAbs + "\t" + finalEQTLBuffer[i].getZscoreAbs() + "\t" + finalEQTLBuffer[i].toString()); - } - } - oppositeEffects.close(); - } + oppositeEffects.writeln(oppositeEffectIndicator + "\t" + pValueAbs + "\t" + finalEQTLBuffer[i].getZscoreAbs() + "\t" + finalEQTLBuffer[i].toString()); + } + } + oppositeEffects.close(); + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalyze.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalyze.java index a286389bb..54b3e5719 100644 
--- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalyze.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaAnalyze.java @@ -6,25 +6,25 @@ // //import eqtlmappingpipeline.gpio.binary.Dataset; -import umcg.genetica.io.trityper.probeannotation.ProbeTranslation; import eqtlmappingpipeline.binarymeta.meta.graphics.ZScorePlot; +import eqtlmappingpipeline.metaqtl3.FDR; +import eqtlmappingpipeline.metaqtl3.graphics.EQTLDotPlot; +import umcg.genetica.io.Gpio; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.io.trityper.EQTL; import umcg.genetica.io.trityper.bin.BinaryResultDataset; import umcg.genetica.io.trityper.bin.BinaryResultProbe; import umcg.genetica.io.trityper.bin.BinaryResultSNP; -import java.util.Arrays; +import umcg.genetica.io.trityper.probeannotation.ProbeTranslation; import umcg.genetica.math.stats.Descriptives; -import eqtlmappingpipeline.metaqtl3.FDR; -import eqtlmappingpipeline.metaqtl3.graphics.EQTLDotPlot; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.concurrent.LinkedBlockingQueue; import java.util.zip.DataFormatException; -import umcg.genetica.io.Gpio; -import umcg.genetica.io.text.TextFile; -import umcg.genetica.io.trityper.EQTL; ///** // * @@ -32,117 +32,119 @@ // */ public class MetaAnalyze { - protected static MetaSettings m_settings; - protected BinaryResultDataset[] ds; - protected ArrayList probes; - protected ArrayList snps; - protected Integer[][] snpTranslation; - protected int[] pvaluedistribution; - protected EQTL[] eQTLBuffer; - protected EQTL[] finalEQTLBuffer; - protected int nrInFinalBuffer = 0; - protected double pvaluethreshold; - protected ArrayList snpChr; - protected ArrayList snpChrPos; - protected ProbeTranslation probeTranslation; - protected Integer[][] probeTranslationLookupTable; - public static String header = "PValue\t" - + "SNPName\t" - + 
"SNPChr\t" - + "SNPChrPos\t" - + "ProbeName\t" - + "ProbeChr\t" - + "ProbeCenterChrPos\t" - + "CisTrans\t" - + "SNPType\t" - + "AlleleAssessed\t" - + "OverallZScore\t" - + "DatasetsWhereSNPProbePairIsAvailableAndPassesQC\t" - + "DatasetsZScores\t" - + "DatasetsNrSamples\t" - + "IncludedDatasetsMeanProbeExpression\t" - + "IncludedDatasetsProbeExpressionVariance\t" - + "HGNCName\t" - + "IncludedDatasetsCorrelationCoefficient"; - protected double[] zsumPerSNP; - protected int[] zsumSNPsNumberOfProbes; - protected double[] zsumPerProbe; - protected int[] zsumProbesNumberOfSNPs; - protected ZScorePlot zs; - protected TextFile zscoretable; - protected HashSet uniqueProbes; - protected HashSet uniqueSNPs; - protected int nrTotalSamples; - protected int numSNPs; - protected int numProbes; - private HashSet probeListToAnalyze; - - public void init(String settingsFile, String texttoreplace, String replacetextwith) throws IOException { - m_settings = new MetaSettings(); - m_settings.parse(settingsFile, texttoreplace, replacetextwith); - probeTranslation = new ProbeTranslation(); - probeTranslation.load(m_settings.getProbetranslationfile()); - - } - - public void analyze() throws IOException, DataFormatException, Exception { - System.out.println(""); - System.out.println("Starting analysis!"); - - String[] datasets = new String[m_settings.getDatasetnames().size()]; - for (int i = 0; i < m_settings.getDatasetnames().size(); i++) { - datasets[i] = m_settings.getDatasetnames().get(i); - } - - if (!m_settings.getOutput().endsWith("/")) { - m_settings.setOutput(m_settings.getOutput() + "/MetaAnalysis/"); - } - - if (!Gpio.exists(m_settings.getOutput())) { - Gpio.createDir(m_settings.getOutput()); - } - m_settings.save(); - - String[] locations = new String[m_settings.getDatasetnames().size()]; - for (int i = 0; i < locations.length; i++) { - locations[i] = m_settings.getDatasetlocations().get(i); - } - - int permstart = 0; - int permstop = m_settings.getNrPermutations() + 1; - - if 
(m_settings.getRunonlypermutation() > -1) { - permstart = m_settings.getRunonlypermutation(); - permstop = m_settings.getRunonlypermutation() + m_settings.getNrPermutations(); - } - - for (int perm = permstart; perm < permstop; perm++) { - ds = new BinaryResultDataset[m_settings.getDatasetlocations().size()]; - runCalculationRound(perm, locations, datasets, -1); - } - - if (m_settings.getRunonlypermutation() == -1) { - - if (m_settings.getNrPermutations() > 0) { - FDR.calculateFDR(m_settings.getOutput(), m_settings.getNrPermutations(), m_settings.getFinalEQTLBufferMaxLength(), m_settings.getFdrthreshold(), true, null, null, FDR.FDRMethod.ALL, true); - EQTLDotPlot edp = new EQTLDotPlot(); - edp.draw(m_settings.getOutput() + "/eQTLsFDR" + m_settings.getFdrthreshold() + ".txt", m_settings.getOutput() + "/DotPlot-FDR" + m_settings.getFdrthreshold() + ".pdf", EQTLDotPlot.Output.PDF); // "/eQTLsFDR" + fdrCutOff + ".txt", outputReportsDir + "/eQTLsFDR" + fdrCutOff + "DotPlot.png" - edp = null; - } - } - - } - - protected void initdatasets(String[] locations, int perm, int dToUse) throws IOException { - - int numProbes = probeTranslation.getNumProbes(); - System.out.println(numProbes + " probes found in translation table. 
Now matching probes across datasets.."); - probeTranslationLookupTable = new Integer[ds.length][numProbes]; - HashSet probesPresentInDatasets = new HashSet(); + protected static MetaSettings m_settings; + protected BinaryResultDataset[] ds; + protected ArrayList probes; + protected ArrayList snps; + protected Integer[][] snpTranslation; + protected int[] pvaluedistribution; + protected EQTL[] eQTLBuffer; + protected EQTL[] finalEQTLBuffer; + protected int nrInFinalBuffer = 0; + protected double pvaluethreshold; + protected ArrayList snpChr; + protected ArrayList snpChrPos; + protected ProbeTranslation probeTranslation; + protected Integer[][] probeTranslationLookupTable; + public static String header = "PValue\t" + + "SNPName\t" + + "SNPChr\t" + + "SNPChrPos\t" + + "ProbeName\t" + + "ProbeChr\t" + + "ProbeCenterChrPos\t" + + "CisTrans\t" + + "SNPType\t" + + "AlleleAssessed\t" + + "OverallZScore\t" + + "DatasetsWhereSNPProbePairIsAvailableAndPassesQC\t" + + "DatasetsZScores\t" + + "DatasetsNrSamples\t" + + "IncludedDatasetsMeanProbeExpression\t" + + "IncludedDatasetsProbeExpressionVariance\t" + + "HGNCName\t" + + "IncludedDatasetsCorrelationCoefficient"; + protected double[] zsumPerSNP; + protected int[] zsumSNPsNumberOfProbes; + protected double[] zsumPerProbe; + protected int[] zsumProbesNumberOfSNPs; + protected ZScorePlot zs; + protected TextFile zscoretable; + protected HashSet uniqueProbes; + protected HashSet uniqueSNPs; + protected int nrTotalSamples; + protected int numSNPs; + protected int numProbes; + private HashSet probeListToAnalyze; + + public void init(String settingsFile, String texttoreplace, String replacetextwith) throws IOException { + m_settings = new MetaSettings(); + m_settings.parse(settingsFile, texttoreplace, replacetextwith); + probeTranslation = new ProbeTranslation(); + probeTranslation.load(m_settings.getProbetranslationfile()); + + } + + public void analyze() throws IOException, DataFormatException, Exception { + 
System.out.println(""); + System.out.println("Starting analysis!"); + + String[] datasets = new String[m_settings.getDatasetnames().size()]; + for (int i = 0; i < m_settings.getDatasetnames().size(); i++) { + datasets[i] = m_settings.getDatasetnames().get(i); + } + + if (!m_settings.getOutput().endsWith("/")) { + m_settings.setOutput(m_settings.getOutput() + "/MetaAnalysis/"); + } + + if (!Gpio.exists(m_settings.getOutput())) { + Gpio.createDir(m_settings.getOutput()); + } + m_settings.save(); + + String[] locations = new String[m_settings.getDatasetnames().size()]; + for (int i = 0; i < locations.length; i++) { + locations[i] = m_settings.getDatasetlocations().get(i); + } + + int permstart = 0; + int permstop = m_settings.getNrPermutations() + 1; + + if (m_settings.getRunonlypermutation() > -1) { + permstart = m_settings.getRunonlypermutation(); + permstop = m_settings.getRunonlypermutation() + m_settings.getNrPermutations(); + } + + System.out.println(permstart + " - " + permstop); + + for (int perm = permstart; perm < permstop; perm++) { + ds = new BinaryResultDataset[m_settings.getDatasetlocations().size()]; + runCalculationRound(perm, locations, datasets, -1); + } + + if (m_settings.getRunonlypermutation() == -1) { + + if (m_settings.getNrPermutations() > 0) { + FDR.calculateFDR(m_settings.getOutput(), m_settings.getNrPermutations(), m_settings.getFinalEQTLBufferMaxLength(), m_settings.getFdrthreshold(), true, null, null, FDR.FDRMethod.ALL, true); + EQTLDotPlot edp = new EQTLDotPlot(); + edp.draw(m_settings.getOutput() + "/eQTLsFDR" + m_settings.getFdrthreshold() + ".txt", m_settings.getOutput() + "/DotPlot-FDR" + m_settings.getFdrthreshold() + ".pdf", EQTLDotPlot.Output.PDF); // "/eQTLsFDR" + fdrCutOff + ".txt", outputReportsDir + "/eQTLsFDR" + fdrCutOff + "DotPlot.png" + edp = null; + } + } + + } + + protected void initdatasets(String[] locations, int perm, int dToUse) throws IOException { + + int numProbes = probeTranslation.getNumProbes(); + 
System.out.println(numProbes + " probes found in translation table. Now matching probes across datasets.."); + probeTranslationLookupTable = new Integer[ds.length][numProbes]; + HashSet probesPresentInDatasets = new HashSet(); // m_settings.getSNPSelection(); - HashSet selectedSNPs = null; + HashSet selectedSNPs = null; // if (m_settings.getSNPSelection() != null) { // System.out.println("Selecting SNPs from: " + m_settings.getSNPSelection()); @@ -153,93 +155,93 @@ protected void initdatasets(String[] locations, int perm, int dToUse) throws IOE // System.out.println("Selected " + selectedSNPs.size() + " unique SNPs from file."); // } - HashMap> selectedSNPProbePairs = null; - if (m_settings.getSNPProbeSelection() != null) { - System.out.println("Selecting SNP-probe pairs from: " + m_settings.getSNPProbeSelection()); - selectedSNPProbePairs = new HashMap>(); - selectedSNPs = new HashSet(); - TextFile stf = new TextFile(m_settings.getSNPProbeSelection(), TextFile.R); - int ctr = 0; - String[] felems = stf.readLineElems(TextFile.tab); - while (felems != null) { - String snp = felems[0]; - String probe = felems[1]; - HashSet probesForSNP = selectedSNPProbePairs.get(snp); - if (probesForSNP == null) { - probesForSNP = new HashSet(); - } - probesForSNP.add(probe); - selectedSNPs.add(snp); - selectedSNPProbePairs.put(snp, probesForSNP); - ctr++; - felems = stf.readLineElems(TextFile.tab); - } - - stf.close(); - System.out.println("Selected " + ctr + " unique SNPs from file."); - } - - HashSet probesToInclude = null; - - if (m_settings.getProbeselection() != null) { - TextFile tf = new TextFile(m_settings.getProbeselection(), TextFile.R); - - ArrayList probesSelected = tf.readAsArrayList(); - - probesToInclude = new HashSet(); - probesToInclude.addAll(probesSelected); - System.out.println(probesSelected.size() +" probes selected from file: "+m_settings.getProbeselection()); - tf.close(); - } - - for (int d = 0; d < ds.length; d++) { - - int probeAnnotationToUse = d; - if 
(dToUse != -1) { - probeAnnotationToUse = dToUse; - } - - ds[d] = new BinaryResultDataset(locations[d], m_settings.getDatasetPrefix().get(probeAnnotationToUse), perm); - BinaryResultProbe[] dsProbes = ds[d].getProbes(); - BinaryResultSNP[] dsSNPs = ds[d].getSnps(); - nrTotalSamples += ds[d].getMaxNrSamples(); - - for (BinaryResultProbe p : dsProbes) { - Integer newProbeId = probeTranslation.getProbeId(m_settings.getDatasetannotations().get(probeAnnotationToUse) + p.getName()); - if (newProbeId == null) { - System.out.println(m_settings.getDatasetannotations().get(probeAnnotationToUse) + "\t" + p.getName() + " probe not present in annotationfile...?"); - System.exit(0); - } - if (probesToInclude == null || probesToInclude.contains("" + newProbeId)) { - probesPresentInDatasets.add(newProbeId); - probeTranslationLookupTable[d][newProbeId] = p.getId(); - } else { - probeTranslationLookupTable[d][newProbeId] = null; - } - } - - for (BinaryResultSNP s : dsSNPs) { - if (!uniqueSNPs.contains(s.getName()) && (selectedSNPs == null || selectedSNPs.contains(s.getName()))) { - snps.add(s.getName()); - snpChr.add(s.getChr()); - snpChrPos.add(s.getChrpos()); - uniqueSNPs.add(s.getName()); - } - } + HashMap> selectedSNPProbePairs = null; + if (m_settings.getSNPProbeSelection() != null) { + System.out.println("Selecting SNP-probe pairs from: " + m_settings.getSNPProbeSelection()); + selectedSNPProbePairs = new HashMap>(); + selectedSNPs = new HashSet(); + TextFile stf = new TextFile(m_settings.getSNPProbeSelection(), TextFile.R); + int ctr = 0; + String[] felems = stf.readLineElems(TextFile.tab); + while (felems != null) { + String snp = felems[0].intern(); + String probe = felems[1].intern(); + HashSet probesForSNP = selectedSNPProbePairs.get(snp); + if (probesForSNP == null) { + probesForSNP = new HashSet(); + } + probesForSNP.add(probe.intern()); + selectedSNPs.add(snp.intern()); + selectedSNPProbePairs.put(snp.intern(), probesForSNP); + ctr++; + felems = 
stf.readLineElems(TextFile.tab); + } + + stf.close(); + System.out.println("Selected " + ctr + " unique SNPs from file."); + } + + HashSet probesToInclude = null; + + if (m_settings.getProbeselection() != null) { + TextFile tf = new TextFile(m_settings.getProbeselection(), TextFile.R); + + ArrayList probesSelected = tf.readAsArrayList(); + + probesToInclude = new HashSet(); + probesToInclude.addAll(probesSelected); + System.out.println(probesSelected.size() + " probes selected from file: " + m_settings.getProbeselection()); + tf.close(); + } + + for (int d = 0; d < ds.length; d++) { + + int probeAnnotationToUse = d; + if (dToUse != -1) { + probeAnnotationToUse = dToUse; + } + + ds[d] = new BinaryResultDataset(locations[d], m_settings.getDatasetPrefix().get(probeAnnotationToUse), perm); + BinaryResultProbe[] dsProbes = ds[d].getProbes(); + BinaryResultSNP[] dsSNPs = ds[d].getSnps(); + nrTotalSamples += ds[d].getMaxNrSamples(); + + for (BinaryResultProbe p : dsProbes) { + Integer newProbeId = probeTranslation.getProbeId(m_settings.getDatasetannotations().get(probeAnnotationToUse) + p.getName()); + if (newProbeId == null) { + System.out.println(m_settings.getDatasetannotations().get(probeAnnotationToUse) + "\t" + p.getName() + " probe not present in annotationfile...?"); + System.exit(0); + } + if (probesToInclude == null || probesToInclude.contains("" + newProbeId)) { + probesPresentInDatasets.add(newProbeId); + probeTranslationLookupTable[d][newProbeId] = p.getId(); + } else { + probeTranslationLookupTable[d][newProbeId] = null; + } + } + + for (BinaryResultSNP s : dsSNPs) { + if (!uniqueSNPs.contains(s.getName().intern()) && (selectedSNPs == null || selectedSNPs.contains(s.getName().intern()))) { + snps.add(s.getName().intern()); + snpChr.add(s.getChr()); + snpChrPos.add(s.getChrpos()); + uniqueSNPs.add(s.getName().intern()); + } + } // ds[d].clearProbeObjects(); - } + } - TextFile probesPresentFile = new TextFile(m_settings.getOutput() + 
"ProbesPresentInAtLeastOneDataset.txt", TextFile.W); + TextFile probesPresentFile = new TextFile(m_settings.getOutput() + "ProbesPresentInAtLeastOneDataset.txt", TextFile.W); - System.out.println(probesPresentInDatasets.size() + "\tunique probes present in all datasets."); - Integer[] presentNrs = probesPresentInDatasets.toArray(new Integer[0]); - for (Integer i : presentNrs) { - probesPresentFile.writeln("" + i); - } - probesPresentFile.close(); + System.out.println(probesPresentInDatasets.size() + "\tunique probes present in all datasets."); + Integer[] presentNrs = probesPresentInDatasets.toArray(new Integer[0]); + for (Integer i : presentNrs) { + probesPresentFile.writeln("" + i); + } + probesPresentFile.close(); - int selectedprobes = 0; + int selectedprobes = 0; // if (m_settings.getProbeselection() != null) { @@ -280,258 +282,255 @@ protected void initdatasets(String[] locations, int perm, int dToUse) throws IOE // System.out.println(probePresenceCounter + "\tprobes selected."); // // } else { - for (int q = 0; q < probeTranslationLookupTable[0].length; q++) { - int probePresenceCounter = 0; - if (probeListToAnalyze != null) { - if (!probeListToAnalyze.contains("" + q)) { - for (int d = 0; d < ds.length; d++) { - probeTranslationLookupTable[d][q] = null; - } - } - } - - for (int i = 0; i < ds.length; i++) { - if (probeTranslationLookupTable[i][q] != null && ds[i].getMaxNrSamples() > m_settings.getProbeAndSNPPresenceFilterSampleThreshold()) { - probePresenceCounter++; - } - } - - - - if (m_settings.getProbeDatasetPresenceThreshold() > 0 && probePresenceCounter < m_settings.getProbeDatasetPresenceThreshold()) { - for (int d = 0; d < ds.length; d++) { - probeTranslationLookupTable[d][q] = null; - } - } else if (probePresenceCounter > 0) { - selectedprobes++; - } - } - System.out.println("Selected " + selectedprobes + " probes from at least " + m_settings.getProbeDatasetPresenceThreshold() + " datasets of at least " + 
m_settings.getProbeAndSNPPresenceFilterSampleThreshold() + " samples."); + for (int q = 0; q < probeTranslationLookupTable[0].length; q++) { + int probePresenceCounter = 0; + if (probeListToAnalyze != null) { + if (!probeListToAnalyze.contains("" + q)) { + for (int d = 0; d < ds.length; d++) { + probeTranslationLookupTable[d][q] = null; + } + } + } + + for (int i = 0; i < ds.length; i++) { + if (probeTranslationLookupTable[i][q] != null && ds[i].getMaxNrSamples() > m_settings.getProbeAndSNPPresenceFilterSampleThreshold()) { + probePresenceCounter++; + } + } + + + if (m_settings.getProbeDatasetPresenceThreshold() > 0 && probePresenceCounter < m_settings.getProbeDatasetPresenceThreshold()) { + for (int d = 0; d < ds.length; d++) { + probeTranslationLookupTable[d][q] = null; + } + } else if (probePresenceCounter > 0) { + selectedprobes++; + } + } + System.out.println("Selected " + selectedprobes + " probes from at least " + m_settings.getProbeDatasetPresenceThreshold() + " datasets of at least " + m_settings.getProbeAndSNPPresenceFilterSampleThreshold() + " samples."); // } // numProbes = uniqueProbes.size(); - numSNPs = uniqueSNPs.size(); + numSNPs = uniqueSNPs.size(); - initSNPTranslation(); - } + initSNPTranslation(); + } - protected void initSNPTranslation() throws IOException { - snpTranslation = new Integer[ds.length][numSNPs]; + protected void initSNPTranslation() throws IOException { + snpTranslation = new Integer[ds.length][numSNPs]; - for (int d = 0; d < ds.length; d++) { - BinaryResultProbe[] dsProbes = ds[d].getProbes(); - BinaryResultSNP[] dsSNPs = ds[d].getSnps(); + for (int d = 0; d < ds.length; d++) { + BinaryResultProbe[] dsProbes = ds[d].getProbes(); + BinaryResultSNP[] dsSNPs = ds[d].getSnps(); - for (int i = 0; i < snps.size(); i++) { - BinaryResultSNP s = ds[d].getStringToSNP().get(snps.get(i)); - if (s != null) { - snpTranslation[d][i] = s.getId(); - } else { - snpTranslation[d][i] = null; - } - } - } + for (int i = 0; i < snps.size(); i++) { + 
BinaryResultSNP s = ds[d].getStringToSNP().get(snps.get(i)); + if (s != null) { + snpTranslation[d][i] = s.getId(); + } else { + snpTranslation[d][i] = null; + } + } + } - int selectedsnps = 0; + int selectedsnps = 0; - HashSet selectedSNPs = null; - if (m_settings.getSNPSelection() != null) { - System.out.println("Selecting SNPs from: " + m_settings.getSNPSelection()); - selectedSNPs = new HashSet(); - TextFile stf = new TextFile(m_settings.getSNPSelection(), TextFile.R); - selectedSNPs.addAll(stf.readAsArrayList()); - stf.close(); - System.out.println("Selected " + selectedSNPs.size() + " unique SNPs from file."); - } + HashSet selectedSNPs = null; + if (m_settings.getSNPSelection() != null) { + System.out.println("Selecting SNPs from: " + m_settings.getSNPSelection()); + selectedSNPs = new HashSet(); + TextFile stf = new TextFile(m_settings.getSNPSelection(), TextFile.R); + selectedSNPs.addAll(stf.readAsArrayList()); + stf.close(); + System.out.println("Selected " + selectedSNPs.size() + " unique SNPs from file."); + } - TextFile selectedSNPFile = new TextFile(m_settings.getOutput() + "/SelectedSNPs.txt", TextFile.W); - for (int s = 0; s < numSNPs; s++) { + TextFile selectedSNPFile = new TextFile(m_settings.getOutput() + "/SelectedSNPs.txt", TextFile.W); + for (int s = 0; s < numSNPs; s++) { - String snpName = snps.get(s); + String snpName = snps.get(s); - int snppresencecounter = 0; - for (int d = 0; d < ds.length; d++) { - if (snpTranslation[d][s] != null && ds[d].getMaxNrSamples() >= m_settings.getProbeAndSNPPresenceFilterSampleThreshold()) { - snppresencecounter++; - } - } + int snppresencecounter = 0; + for (int d = 0; d < ds.length; d++) { + if (snpTranslation[d][s] != null && ds[d].getMaxNrSamples() >= m_settings.getProbeAndSNPPresenceFilterSampleThreshold()) { + snppresencecounter++; + } + } - if (m_settings.getSnpDatasetPresenceThreshold() > 0 && snppresencecounter < m_settings.getSnpDatasetPresenceThreshold() || (selectedSNPs != null && 
!selectedSNPs.contains(snpName))) { - for (int d = 0; d < ds.length; d++) { - snpTranslation[d][s] = null; - } - } else if (snppresencecounter > 0) { - selectedSNPFile.writeln(snps.get(s)); - selectedsnps++; - } + if (m_settings.getSnpDatasetPresenceThreshold() > 0 && snppresencecounter < m_settings.getSnpDatasetPresenceThreshold() || (selectedSNPs != null && !selectedSNPs.contains(snpName))) { + for (int d = 0; d < ds.length; d++) { + snpTranslation[d][s] = null; + } + } else if (snppresencecounter > 0) { + selectedSNPFile.writeln(snps.get(s)); + selectedsnps++; + } - } + } - selectedSNPFile.close(); + selectedSNPFile.close(); - System.out.println("Selected " + selectedsnps + " snps from at least " + m_settings.getSnpDatasetPresenceThreshold() + " datasets of at least " + m_settings.getProbeAndSNPPresenceFilterSampleThreshold() + " samples."); - } + System.out.println("Selected " + selectedsnps + " snps from at least " + m_settings.getSnpDatasetPresenceThreshold() + " datasets of at least " + m_settings.getProbeAndSNPPresenceFilterSampleThreshold() + " samples."); + } - protected void runCalculationRound(int perm, String[] locations, String[] datasets, int dToUse) throws IOException, Exception { - pvaluedistribution = null; - eQTLBuffer = null; - finalEQTLBuffer = null; - nrInFinalBuffer = 0; + protected void runCalculationRound(int perm, String[] locations, String[] datasets, int dToUse) throws IOException, Exception { + pvaluedistribution = null; + eQTLBuffer = null; + finalEQTLBuffer = null; + nrInFinalBuffer = 0; - uniqueProbes = new HashSet(); - uniqueSNPs = new HashSet(); + uniqueProbes = new HashSet(); + uniqueSNPs = new HashSet(); - int numDatasets = ds.length; - probes = new ArrayList(); + int numDatasets = ds.length; + probes = new ArrayList(); - snps = new ArrayList(); - snpChr = new ArrayList(); - snpChrPos = new ArrayList(); + snps = new ArrayList(); + snpChr = new ArrayList(); + snpChrPos = new ArrayList(); - nrTotalSamples = 0; + nrTotalSamples = 0; 
- String[] probeName = probeTranslation.getProbes(); - probes.addAll(Arrays.asList(probeName)); + String[] probeName = probeTranslation.getProbes(); + probes.addAll(Arrays.asList(probeName)); - initdatasets(locations, perm, dToUse); + initdatasets(locations, perm, dToUse); - String zsName = null; - if (m_settings.isMakezscoreplot()) { - zs = new ZScorePlot(); - String[] datasets2 = new String[datasets.length + 1]; - System.arraycopy(datasets, 0, datasets2, 0, datasets.length); - datasets2[datasets2.length - 1] = "Meta-Analysis"; + String zsName = null; + if (m_settings.isMakezscoreplot()) { + zs = new ZScorePlot(); + String[] datasets2 = new String[datasets.length + 1]; + System.arraycopy(datasets, 0, datasets2, 0, datasets.length); + datasets2[datasets2.length - 1] = "Meta-Analysis"; - if (perm > 0) { - zsName = m_settings.getOutput() + "ZScoreComparison-PermutationRound" + perm; - } else { - zsName = m_settings.getOutput() + "ZScoreComparison"; - } - zs.init(numDatasets + 1, datasets2, true, zsName); - } + if (perm > 0) { + zsName = m_settings.getOutput() + "ZScoreComparison-PermutationRound" + perm; + } else { + zsName = m_settings.getOutput() + "ZScoreComparison"; + } + zs.init(numDatasets + 1, datasets2, true, zsName); + } - Descriptives.lookupSqrt(nrTotalSamples); - pvaluedistribution = new int[m_settings.getNrOfBins()]; + Descriptives.lookupSqrt(nrTotalSamples); + pvaluedistribution = new int[m_settings.getNrOfBins()]; - eQTLBuffer = new EQTL[10000]; - finalEQTLBuffer = new EQTL[0]; + eQTLBuffer = new EQTL[10000]; + finalEQTLBuffer = new EQTL[0]; - pvaluethreshold = Double.MAX_VALUE; + pvaluethreshold = Double.MAX_VALUE; - zsumPerSNP = new double[snps.size()]; - zsumSNPsNumberOfProbes = new int[snps.size()]; - zsumPerProbe = new double[probes.size()]; - zsumProbesNumberOfSNPs = new int[probes.size()]; + zsumPerSNP = new double[snps.size()]; + zsumSNPsNumberOfProbes = new int[snps.size()]; + zsumPerProbe = new double[probes.size()]; + zsumProbesNumberOfSNPs = 
new int[probes.size()]; - System.out.println("Performing the meta-analysis now: "); + System.out.println("Performing the meta-analysis now: "); // System.out.println(snps.size() + "\t unique SNPs present in at least " + m_settings.snpDatasetPresenceThreshold + " datasets"); // System.out.println(probes.size() + "\t unique Probespresent in at least " + m_settings.probeDatasetPresenceThreshold + " datasets"); - System.out.println(nrTotalSamples + "\t total samples"); - - if (m_settings.isMakezscoretable()) { - if (perm == 0) { - zscoretable = new TextFile(m_settings.getOutput() + "metazscoretable.txt.gz", TextFile.W, (10 * 1048576)); - } else { - zscoretable = new TextFile(m_settings.getOutput() + "metazscoretable-Permutation" + perm + ".txt.gz", TextFile.W, (10 * 1048576)); - } - StringBuilder zscoreout = new StringBuilder(); - zscoreout.append("SNP\tAlleleCoding\tAssessedAllele"); - for (int i = 0; i < probes.size(); i++) { - zscoreout.append("\t").append(probes.get(i)); - } - zscoretable.writeln(zscoreout.toString()); - - } - - HashMap> selectedSNPProbePairs = null; - if (m_settings.getSNPProbeSelection() != null) { - System.out.println("Selecting SNP-probe pairs from: " + m_settings.getSNPProbeSelection()); - selectedSNPProbePairs = new HashMap>(); - - TextFile stf = new TextFile(m_settings.getSNPProbeSelection(), TextFile.R); - int ctr = 0; - String[] felems = stf.readLineElems(TextFile.tab); - while (felems != null) { - String snp = felems[0]; - String probe = felems[1]; - HashSet probesForSNP = selectedSNPProbePairs.get(snp); - if (probesForSNP == null) { - probesForSNP = new HashSet(); - } - probesForSNP.add(probe); - selectedSNPProbePairs.put(snp, probesForSNP); - ctr++; - felems = stf.readLineElems(TextFile.tab); - } - - stf.close(); - System.out.println("Selected " + ctr + " unique SNPs from file."); - } - - /// init calculation pool, - - int nrProcs = Runtime.getRuntime().availableProcessors(); - if (m_settings.getNrThresds() > 0) { - if 
(m_settings.getNrThresds() > nrProcs) { - m_settings.setNrThresds(nrProcs); - } - nrProcs = m_settings.getNrThresds(); - } - System.out.println("Using " + nrProcs + " threads :)"); - MetaAnalysisCalculationThread[] calcPool = new MetaAnalysisCalculationThread[nrProcs]; - LinkedBlockingQueue loaderQueue = new LinkedBlockingQueue(nrProcs); - MetaAnalysisLoaderThread loaderThread = new MetaAnalysisLoaderThread(loaderQueue, snpTranslation, snps, ds); - loaderThread.setName("Loader"); - loaderThread.start(); - - PValueThreshold p = new PValueThreshold(); - LinkedBlockingQueue resultQueue = new LinkedBlockingQueue(nrProcs); - MetaAnalysisResultThread resultThread = new MetaAnalysisResultThread(resultQueue, m_settings, datasets, perm, zscoretable, p, snps, selectedSNPProbePairs, probes); - resultThread.setName("Result"); - resultThread.start(); - - for (int i = 0; i < nrProcs; i++) { - calcPool[i] = new MetaAnalysisCalculationThread(loaderQueue, resultQueue, snps, probes, snpChr, snpChrPos, ds, snpTranslation, probeTranslationLookupTable, probeTranslation, m_settings, zs, p); - calcPool[i].setName("MetaCalc-" + i); - calcPool[i].start(); - } - - // kill the threads - try { - loaderThread.join(); - MetaAnalysisWorkPackage poison = new MetaAnalysisWorkPackage(0, 0); - poison.poisonTheWell(); - - for (int threadNum = 0; threadNum < calcPool.length; threadNum++) { - try { - loaderQueue.put(poison); - } catch (InterruptedException ex) { - ex.printStackTrace(); - } - } - for (int threadNum = 0; threadNum < calcPool.length; threadNum++) { - calcPool[threadNum].join(); - } - - resultQueue.put(poison); - resultThread.join(); - - } catch (InterruptedException e) { - System.err.println("Exception: Thread main interrupted."); - } - - if (m_settings.isMakezscoretable()) { + System.out.println(nrTotalSamples + "\t total samples"); + + if (m_settings.isMakezscoretable()) { + if (perm == 0) { + zscoretable = new TextFile(m_settings.getOutput() + "metazscoretable.txt.gz", TextFile.W, (10 
* 1048576)); + } else { + zscoretable = new TextFile(m_settings.getOutput() + "metazscoretable-Permutation" + perm + ".txt.gz", TextFile.W, (10 * 1048576)); + } + StringBuilder zscoreout = new StringBuilder(); + zscoreout.append("SNP\tAlleleCoding\tAssessedAllele"); + for (int i = 0; i < probes.size(); i++) { + zscoreout.append("\t").append(probes.get(i)); + } + zscoretable.writeln(zscoreout.toString()); + + } + + HashMap> selectedSNPProbePairs = null; + if (m_settings.getSNPProbeSelection() != null) { + System.out.println("Selecting SNP-probe pairs from: " + m_settings.getSNPProbeSelection()); + selectedSNPProbePairs = new HashMap>(); + + TextFile stf = new TextFile(m_settings.getSNPProbeSelection(), TextFile.R); + int ctr = 0; + String[] felems = stf.readLineElems(TextFile.tab); + while (felems != null) { + String snp = felems[0]; + String probe = felems[1]; + HashSet probesForSNP = selectedSNPProbePairs.get(snp); + if (probesForSNP == null) { + probesForSNP = new HashSet(); + } + probesForSNP.add(probe); + selectedSNPProbePairs.put(snp, probesForSNP); + ctr++; + felems = stf.readLineElems(TextFile.tab); + } + + stf.close(); + System.out.println("Selected " + ctr + " unique SNPs from file."); + } + + /// init calculation pool, + + int nrProcs = Runtime.getRuntime().availableProcessors(); + if (m_settings.getNrThresds() > 0) { + if (m_settings.getNrThresds() > nrProcs) { + m_settings.setNrThresds(nrProcs); + } + nrProcs = m_settings.getNrThresds(); + } + System.out.println("Using " + nrProcs + " threads :)"); + MetaAnalysisCalculationThread[] calcPool = new MetaAnalysisCalculationThread[nrProcs]; + LinkedBlockingQueue loaderQueue = new LinkedBlockingQueue(nrProcs); + MetaAnalysisLoaderThread loaderThread = new MetaAnalysisLoaderThread(loaderQueue, snpTranslation, snps, ds); + loaderThread.setName("Loader"); + loaderThread.start(); + + PValueThreshold p = new PValueThreshold(); + LinkedBlockingQueue resultQueue = new LinkedBlockingQueue(nrProcs); + 
MetaAnalysisResultThread resultThread = new MetaAnalysisResultThread(resultQueue, m_settings, datasets, perm, zscoretable, p, snps, selectedSNPProbePairs, probes); + resultThread.setName("Result"); + resultThread.start(); + + for (int i = 0; i < nrProcs; i++) { + calcPool[i] = new MetaAnalysisCalculationThread(loaderQueue, resultQueue, snps, probes, snpChr, snpChrPos, ds, snpTranslation, probeTranslationLookupTable, probeTranslation, m_settings, zs, p); + calcPool[i].setName("MetaCalc-" + i); + calcPool[i].start(); + } + + // kill the threads + try { + loaderThread.join(); + MetaAnalysisWorkPackage poison = new MetaAnalysisWorkPackage(0, 0); + poison.poisonTheWell(); + + for (int threadNum = 0; threadNum < calcPool.length; threadNum++) { + try { + loaderQueue.put(poison); + } catch (InterruptedException ex) { + ex.printStackTrace(); + } + } + for (int threadNum = 0; threadNum < calcPool.length; threadNum++) { + calcPool[threadNum].join(); + } + + resultQueue.put(poison); + resultThread.join(); + + } catch (InterruptedException e) { + System.err.println("Exception: Thread main interrupted."); + } + + if (m_settings.isMakezscoretable()) { // if (perm == 0) { - zscoretable.close(); + zscoretable.close(); // } - } - if (zs != null) { - zs.write(zsName); - } - - + } + if (zs != null) { + zs.write(zsName); + } - } + } } \ No newline at end of file diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaSettings.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaSettings.java index b59e72955..1e553f7aa 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaSettings.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/MetaSettings.java @@ -4,497 +4,508 @@ */ package eqtlmappingpipeline.binarymeta.meta; +import org.apache.commons.configuration.ConfigurationException; +import org.apache.commons.configuration.XMLConfiguration; + import 
java.util.ArrayList; import java.util.logging.Level; import java.util.logging.Logger; -import org.apache.commons.configuration.ConfigurationException; -import org.apache.commons.configuration.XMLConfiguration; /** - * * @author harm-jan */ public class MetaSettings { - private int nrPermutations = 10; - private boolean useAbsoluteZscore = false; - private int finalEQTLBufferMaxLength = 1000000; - private int nrOfBins = 100; - private double fdrthreshold = 0.05; - private boolean includeSNPsWithoutProperMapping = true; - private boolean includeProbesWithoutProperMapping = true; - private boolean cis = true; - private boolean trans = true; - private int cisdistance = 250000; - private int transdistance = 5000000; - private boolean makezscoreplot = true; - private String probetranslationfile; - private ArrayList datasetnames; - private ArrayList datasetPrefix; - private ArrayList datasetlocations; - private ArrayList datasetannotations; - private ArrayList selectedProbes; - private String output; - private boolean makezscoretable = false; - private int probeDatasetPresenceThreshold = 0; - private int snpDatasetPresenceThreshold = 0; - private int probeAndSNPPresenceFilterSampleThreshold = 0; - private int runonlypermutation; - private int nrThresds; - private String probeselection; - private String snpselection; - private XMLConfiguration config; - private String snpprobeselection; - - public void parse(String settings, String texttoreplace, String replacetextwith) { - try { - config = new XMLConfiguration(settings); - - nrPermutations = config.getInt("defaults.permutations", 0); - - useAbsoluteZscore = config.getBoolean("defaults.absolutezscore", false); - finalEQTLBufferMaxLength = config.getInt("defaults.finalnreqtls", 100000); - fdrthreshold = config.getDouble("defaults.fdrthreshold", 0.05); - cisdistance = config.getInt("defaults.cisprobedistance", 250000); - transdistance = config.getInt("defaults.transprobedistance", 5000000); - 
includeProbesWithoutProperMapping = config.getBoolean("defaults.includeprobeswithoutmapping", true); - includeSNPsWithoutProperMapping = config.getBoolean("defaults.includesnpswithoutmapping", true); - makezscoreplot = config.getBoolean("defaults.makezscoreplot", true); - makezscoretable = config.getBoolean("defaults.makezscoretable", false); - probetranslationfile = config.getString("defaults.probetranslationfile"); - output = config.getString("defaults.output"); - - - - - probeDatasetPresenceThreshold = config.getInt("defaults.minimalnumberofdatasetsthatcontainprobe", 0); - snpDatasetPresenceThreshold = config.getInt("defaults.minimalnumberofdatasetsthatcontainsnp", 0); - probeAndSNPPresenceFilterSampleThreshold = config.getInt("defaults.snpprobeselectsamplesizethreshold", -1); - - runonlypermutation = config.getInt("defaults.runonlypermutation", -1); - nrThresds = config.getInt("defaults.threads", 0); - cis = config.getBoolean("defaults.cis", false); - trans = config.getBoolean("defaults.trans", false); - - probeselection = config.getString("defaults.probeselection"); - - if (probeselection != null && probeselection.trim().length() == 0) { - probeselection = null; - } - snpselection = config.getString("defaults.snpselection"); - - if (snpselection != null && snpselection.trim().length() == 0) { - snpselection = null; - } - - snpprobeselection = config.getString("defaults.snpprobeselection"); - - if (snpprobeselection != null && snpprobeselection.trim().length() == 0) { - snpprobeselection = null; - } else { - System.out.println("SNP PROBE SELECTION: "+snpprobeselection); - } - - - - int i = 0; - - String dataset = ""; - datasetnames = new ArrayList(); - datasetlocations = new ArrayList(); - datasetannotations = new ArrayList(); - datasetPrefix = new ArrayList(); - - while (dataset != null) { - dataset = config.getString("datasets.dataset(" + i + ").name"); // see if a dataset is defined - if (dataset != null) { - - datasetnames.add(dataset); - String prefix = 
config.getString("datasets.dataset(" + i + ").prefix"); // see if a dataset is defined - - if (prefix == null) { - prefix = "Dataset"; - } - datasetPrefix.add(prefix); - String datasetlocation = config.getString("datasets.dataset(" + i + ").location"); // see if a dataset is defined - if (texttoreplace != null && replacetextwith != null && datasetlocation.contains(texttoreplace)) { - datasetlocation = datasetlocation.replace(texttoreplace, replacetextwith); - } - String datasetannotation = config.getString("datasets.dataset(" + i + ").expressionplatform"); // see if a dataset is defined - - datasetlocations.add(datasetlocation); - datasetannotations.add(datasetannotation); - } - i++; - } - - - // parse datasets - } catch (ConfigurationException e) { - e.printStackTrace(); - } - } - - /** - * @return the nrPermutations - */ - public int getNrPermutations() { - return nrPermutations; - } - - /** - * @param nrPermutations the nrPermutations to set - */ - public void setNrPermutations(int nrPermutations) { - this.nrPermutations = nrPermutations; - } - - /** - * @return the useAbsoluteZscore - */ - public boolean isUseAbsoluteZscore() { - return useAbsoluteZscore; - } - - /** - * @param useAbsoluteZscore the useAbsoluteZscore to set - */ - public void setUseAbsoluteZscore(boolean useAbsoluteZscore) { - this.useAbsoluteZscore = useAbsoluteZscore; - } - - /** - * @return the finalEQTLBufferMaxLength - */ - public int getFinalEQTLBufferMaxLength() { - return finalEQTLBufferMaxLength; - } - - /** - * @param finalEQTLBufferMaxLength the finalEQTLBufferMaxLength to set - */ - public void setFinalEQTLBufferMaxLength(int finalEQTLBufferMaxLength) { - this.finalEQTLBufferMaxLength = finalEQTLBufferMaxLength; - } - - /** - * @return the nrOfBins - */ - public int getNrOfBins() { - return nrOfBins; - } - - /** - * @param nrOfBins the nrOfBins to set - */ - public void setNrOfBins(int nrOfBins) { - this.nrOfBins = nrOfBins; - } - - /** - * @return the fdrthreshold - */ - public 
double getFdrthreshold() { - return fdrthreshold; - } - - /** - * @param fdrthreshold the fdrthreshold to set - */ - public void setFdrthreshold(double fdrthreshold) { - this.fdrthreshold = fdrthreshold; - } - - /** - * @return the includeSNPsWithoutProperMapping - */ - public boolean isIncludeSNPsWithoutProperMapping() { - return includeSNPsWithoutProperMapping; - } - - /** - * @param includeSNPsWithoutProperMapping the - * includeSNPsWithoutProperMapping to set - */ - public void setIncludeSNPsWithoutProperMapping(boolean includeSNPsWithoutProperMapping) { - this.includeSNPsWithoutProperMapping = includeSNPsWithoutProperMapping; - } - - /** - * @return the includeProbesWithoutProperMapping - */ - public boolean isIncludeProbesWithoutProperMapping() { - return includeProbesWithoutProperMapping; - } - - /** - * @param includeProbesWithoutProperMapping the - * includeProbesWithoutProperMapping to set - */ - public void setIncludeProbesWithoutProperMapping(boolean includeProbesWithoutProperMapping) { - this.includeProbesWithoutProperMapping = includeProbesWithoutProperMapping; - } - - /** - * @return the cis - */ - public boolean isCis() { - return cis; - } - - /** - * @param cis the cis to set - */ - public void setCis(boolean cis) { - this.cis = cis; - } - - /** - * @return the trans - */ - public boolean isTrans() { - return trans; - } - - /** - * @param trans the trans to set - */ - public void setTrans(boolean trans) { - this.trans = trans; - } - - /** - * @return the cisdistance - */ - public int getCisdistance() { - return cisdistance; - } - - /** - * @param cisdistance the cisdistance to set - */ - public void setCisdistance(int cisdistance) { - this.cisdistance = cisdistance; - } - - /** - * @return the transdistance - */ - public int getTransdistance() { - return transdistance; - } - - /** - * @param transdistance the transdistance to set - */ - public void setTransdistance(int transdistance) { - this.transdistance = transdistance; - } - - /** - * @return 
the makezscoreplot - */ - public boolean isMakezscoreplot() { - return makezscoreplot; - } - - /** - * @param makezscoreplot the makezscoreplot to set - */ - public void setMakezscoreplot(boolean makezscoreplot) { - this.makezscoreplot = makezscoreplot; - } - - /** - * @return the probetranslationfile - */ - public String getProbetranslationfile() { - return probetranslationfile; - } - - /** - * @param probetranslationfile the probetranslationfile to set - */ - public void setProbetranslationfile(String probetranslationfile) { - this.probetranslationfile = probetranslationfile; - } - - /** - * @return the datasetnames - */ - public ArrayList getDatasetnames() { - return datasetnames; - } - - /** - * @param datasetnames the datasetnames to set - */ - public void setDatasetnames(ArrayList datasetnames) { - this.datasetnames = datasetnames; - } - - /** - * @return the datasetlocations - */ - public ArrayList getDatasetlocations() { - return datasetlocations; - } - - /** - * @param datasetlocations the datasetlocations to set - */ - public void setDatasetlocations(ArrayList datasetlocations) { - this.datasetlocations = datasetlocations; - } - - /** - * @return the datasetannotations - */ - public ArrayList getDatasetannotations() { - return datasetannotations; - } - - /** - * @param datasetannotations the datasetannotations to set - */ - public void setDatasetannotations(ArrayList datasetannotations) { - this.datasetannotations = datasetannotations; - } - - /** - * @return the output - */ - public String getOutput() { - return output; - } - - /** - * @param output the output to set - */ - public void setOutput(String output) { - this.output = output; - } - - /** - * @return the makezscoretable - */ - public boolean isMakezscoretable() { - return makezscoretable; - } - - /** - * @param makezscoretable the makezscoretable to set - */ - public void setMakezscoretable(boolean makezscoretable) { - this.makezscoretable = makezscoretable; - } - - /** - * @return the 
probeDatasetPresenceThreshold - */ - public int getProbeDatasetPresenceThreshold() { - return probeDatasetPresenceThreshold; - } - - /** - * @param probeDatasetPresenceThreshold the probeDatasetPresenceThreshold to - * set - */ - public void setProbeDatasetPresenceThreshold(int probeDatasetPresenceThreshold) { - this.probeDatasetPresenceThreshold = probeDatasetPresenceThreshold; - } - - /** - * @return the snpDatasetPresenceThreshold - */ - public int getSnpDatasetPresenceThreshold() { - return snpDatasetPresenceThreshold; - } - - /** - * @param snpDatasetPresenceThreshold the snpDatasetPresenceThreshold to set - */ - public void setSnpDatasetPresenceThreshold(int snpDatasetPresenceThreshold) { - this.snpDatasetPresenceThreshold = snpDatasetPresenceThreshold; - } - - /** - * @return the probeAndSNPPresenceFilterSampleThreshold - */ - public int getProbeAndSNPPresenceFilterSampleThreshold() { - return probeAndSNPPresenceFilterSampleThreshold; - } - - /** - * @param probeAndSNPPresenceFilterSampleThreshold the - * probeAndSNPPresenceFilterSampleThreshold to set - */ - public void setProbeAndSNPPresenceFilterSampleThreshold(int probeAndSNPPresenceFilterSampleThreshold) { - this.probeAndSNPPresenceFilterSampleThreshold = probeAndSNPPresenceFilterSampleThreshold; - } - - /** - * @return the runonlypermutation - */ - public int getRunonlypermutation() { - return runonlypermutation; - } - - /** - * @param runonlypermutation the runonlypermutation to set - */ - public void setRunonlypermutation(int runonlypermutation) { - this.runonlypermutation = runonlypermutation; - } - - /** - * @return the nrThresds - */ - public int getNrThresds() { - return nrThresds; - } - - /** - * @param nrThresds the nrThresds to set - */ - public void setNrThresds(int nrThresds) { - this.nrThresds = nrThresds; - } - - ArrayList getDatasetPrefix() { - return datasetPrefix; - } - - /** - * @return the probeselection - */ - public String getProbeselection() { - return probeselection; - } - - /** - 
* @param probeselection the probeselection to set - */ - public void setProbeselection(String probeselection) { - this.probeselection = probeselection; - } - - public String getSNPSelection() { - return snpselection; - } - - public String getSNPProbeSelection() { - return snpprobeselection; - } - - void save() { - try { - config.save(output + "metasettings.xml"); - } catch (ConfigurationException ex) { - Logger.getLogger(MetaSettings.class.getName()).log(Level.SEVERE, null, ex); - } - - } + private int nrPermutations = 10; + private boolean useAbsoluteZscore = false; + private int finalEQTLBufferMaxLength = 1000000; + private int nrOfBins = 100; + private double fdrthreshold = 0.05; + private boolean includeSNPsWithoutProperMapping = true; + private boolean includeProbesWithoutProperMapping = true; + private boolean cis = true; + private boolean trans = true; + private int cisdistance = 250000; + private int transdistance = 5000000; + private boolean makezscoreplot = true; + private String probetranslationfile; + private ArrayList datasetnames; + private ArrayList datasetPrefix; + private ArrayList datasetlocations; + private ArrayList datasetannotations; + private ArrayList selectedProbes; + private String output; + private boolean makezscoretable = false; + private int probeDatasetPresenceThreshold = 0; + private int snpDatasetPresenceThreshold = 0; + private int probeAndSNPPresenceFilterSampleThreshold = 0; + private int runonlypermutation; + private int nrThresds; + private String probeselection; + private String snpselection; + private XMLConfiguration config; + private String snpprobeselection; + + public void parse(String settings, String texttoreplace, String replacetextwith) { + try { + config = new XMLConfiguration(settings); + + nrPermutations = config.getInt("defaults.permutations", 0); + + useAbsoluteZscore = config.getBoolean("defaults.absolutezscore", false); + finalEQTLBufferMaxLength = config.getInt("defaults.finalnreqtls", 100000); + fdrthreshold 
= config.getDouble("defaults.fdrthreshold", 0.05); + cisdistance = config.getInt("defaults.cisprobedistance", 250000); + transdistance = config.getInt("defaults.transprobedistance", 5000000); + includeProbesWithoutProperMapping = config.getBoolean("defaults.includeprobeswithoutmapping", true); + includeSNPsWithoutProperMapping = config.getBoolean("defaults.includesnpswithoutmapping", true); + makezscoreplot = config.getBoolean("defaults.makezscoreplot", true); + makezscoretable = config.getBoolean("defaults.makezscoretable", false); + probetranslationfile = config.getString("defaults.probetranslationfile"); + String outputStr = config.getString("defaults.output"); + + System.out.println("outputstr: " + outputStr); + + if (texttoreplace != null && replacetextwith != null && outputStr.contains(texttoreplace)) { + outputStr = outputStr.replaceAll(texttoreplace, replacetextwith); + System.out.println("outputstr: " + outputStr); + } + output = outputStr; + System.out.println("outputstr: " + outputStr); +// System.exit(-1); + + + probeDatasetPresenceThreshold = config.getInt("defaults.minimalnumberofdatasetsthatcontainprobe", 0); + snpDatasetPresenceThreshold = config.getInt("defaults.minimalnumberofdatasetsthatcontainsnp", 0); + probeAndSNPPresenceFilterSampleThreshold = config.getInt("defaults.snpprobeselectsamplesizethreshold", -1); + + runonlypermutation = config.getInt("defaults.runonlypermutation", -1); + nrThresds = config.getInt("defaults.threads", 0); + cis = config.getBoolean("defaults.cis", false); + trans = config.getBoolean("defaults.trans", false); + + probeselection = config.getString("defaults.probeselection"); + + if (probeselection != null && probeselection.trim().length() == 0) { + probeselection = null; + } + snpselection = config.getString("defaults.snpselection"); + + if (snpselection != null && snpselection.trim().length() == 0) { + snpselection = null; + } + + if (texttoreplace != null && replacetextwith != null && 
snpselection.contains(texttoreplace)) { + snpselection = snpselection.replaceAll(texttoreplace, replacetextwith); + } + + snpprobeselection = config.getString("defaults.snpprobeselection"); + + if (snpprobeselection != null && snpprobeselection.trim().length() == 0) { + snpprobeselection = null; + } else { + System.out.println("SNP PROBE SELECTION: " + snpprobeselection); + } + + + int i = 0; + + String dataset = ""; + datasetnames = new ArrayList(); + datasetlocations = new ArrayList(); + datasetannotations = new ArrayList(); + datasetPrefix = new ArrayList(); + + while (dataset != null) { + dataset = config.getString("datasets.dataset(" + i + ").name"); // see if a dataset is defined + if (dataset != null) { + + datasetnames.add(dataset); + String prefix = config.getString("datasets.dataset(" + i + ").prefix"); // see if a dataset is defined + + if (prefix == null) { + prefix = "Dataset"; + } + datasetPrefix.add(prefix); + String datasetlocation = config.getString("datasets.dataset(" + i + ").location"); // see if a dataset is defined + if (texttoreplace != null && replacetextwith != null && datasetlocation.contains(texttoreplace)) { + datasetlocation = datasetlocation.replace(texttoreplace, replacetextwith); + } + String datasetannotation = config.getString("datasets.dataset(" + i + ").expressionplatform"); // see if a dataset is defined + + datasetlocations.add(datasetlocation); + datasetannotations.add(datasetannotation); + } + i++; + } + + + // parse datasets + } catch (ConfigurationException e) { + e.printStackTrace(); + } + } + + /** + * @return the nrPermutations + */ + public int getNrPermutations() { + return nrPermutations; + } + + /** + * @param nrPermutations the nrPermutations to set + */ + public void setNrPermutations(int nrPermutations) { + this.nrPermutations = nrPermutations; + } + + /** + * @return the useAbsoluteZscore + */ + public boolean isUseAbsoluteZscore() { + return useAbsoluteZscore; + } + + /** + * @param useAbsoluteZscore the 
useAbsoluteZscore to set + */ + public void setUseAbsoluteZscore(boolean useAbsoluteZscore) { + this.useAbsoluteZscore = useAbsoluteZscore; + } + + /** + * @return the finalEQTLBufferMaxLength + */ + public int getFinalEQTLBufferMaxLength() { + return finalEQTLBufferMaxLength; + } + + /** + * @param finalEQTLBufferMaxLength the finalEQTLBufferMaxLength to set + */ + public void setFinalEQTLBufferMaxLength(int finalEQTLBufferMaxLength) { + this.finalEQTLBufferMaxLength = finalEQTLBufferMaxLength; + } + + /** + * @return the nrOfBins + */ + public int getNrOfBins() { + return nrOfBins; + } + + /** + * @param nrOfBins the nrOfBins to set + */ + public void setNrOfBins(int nrOfBins) { + this.nrOfBins = nrOfBins; + } + + /** + * @return the fdrthreshold + */ + public double getFdrthreshold() { + return fdrthreshold; + } + + /** + * @param fdrthreshold the fdrthreshold to set + */ + public void setFdrthreshold(double fdrthreshold) { + this.fdrthreshold = fdrthreshold; + } + + /** + * @return the includeSNPsWithoutProperMapping + */ + public boolean isIncludeSNPsWithoutProperMapping() { + return includeSNPsWithoutProperMapping; + } + + /** + * @param includeSNPsWithoutProperMapping the + * includeSNPsWithoutProperMapping to set + */ + public void setIncludeSNPsWithoutProperMapping(boolean includeSNPsWithoutProperMapping) { + this.includeSNPsWithoutProperMapping = includeSNPsWithoutProperMapping; + } + + /** + * @return the includeProbesWithoutProperMapping + */ + public boolean isIncludeProbesWithoutProperMapping() { + return includeProbesWithoutProperMapping; + } + + /** + * @param includeProbesWithoutProperMapping the + * includeProbesWithoutProperMapping to set + */ + public void setIncludeProbesWithoutProperMapping(boolean includeProbesWithoutProperMapping) { + this.includeProbesWithoutProperMapping = includeProbesWithoutProperMapping; + } + + /** + * @return the cis + */ + public boolean isCis() { + return cis; + } + + /** + * @param cis the cis to set + */ + public 
void setCis(boolean cis) { + this.cis = cis; + } + + /** + * @return the trans + */ + public boolean isTrans() { + return trans; + } + + /** + * @param trans the trans to set + */ + public void setTrans(boolean trans) { + this.trans = trans; + } + + /** + * @return the cisdistance + */ + public int getCisdistance() { + return cisdistance; + } + + /** + * @param cisdistance the cisdistance to set + */ + public void setCisdistance(int cisdistance) { + this.cisdistance = cisdistance; + } + + /** + * @return the transdistance + */ + public int getTransdistance() { + return transdistance; + } + + /** + * @param transdistance the transdistance to set + */ + public void setTransdistance(int transdistance) { + this.transdistance = transdistance; + } + + /** + * @return the makezscoreplot + */ + public boolean isMakezscoreplot() { + return makezscoreplot; + } + + /** + * @param makezscoreplot the makezscoreplot to set + */ + public void setMakezscoreplot(boolean makezscoreplot) { + this.makezscoreplot = makezscoreplot; + } + + /** + * @return the probetranslationfile + */ + public String getProbetranslationfile() { + return probetranslationfile; + } + + /** + * @param probetranslationfile the probetranslationfile to set + */ + public void setProbetranslationfile(String probetranslationfile) { + this.probetranslationfile = probetranslationfile; + } + + /** + * @return the datasetnames + */ + public ArrayList getDatasetnames() { + return datasetnames; + } + + /** + * @param datasetnames the datasetnames to set + */ + public void setDatasetnames(ArrayList datasetnames) { + this.datasetnames = datasetnames; + } + + /** + * @return the datasetlocations + */ + public ArrayList getDatasetlocations() { + return datasetlocations; + } + + /** + * @param datasetlocations the datasetlocations to set + */ + public void setDatasetlocations(ArrayList datasetlocations) { + this.datasetlocations = datasetlocations; + } + + /** + * @return the datasetannotations + */ + public ArrayList 
getDatasetannotations() { + return datasetannotations; + } + + /** + * @param datasetannotations the datasetannotations to set + */ + public void setDatasetannotations(ArrayList datasetannotations) { + this.datasetannotations = datasetannotations; + } + + /** + * @return the output + */ + public String getOutput() { + return output; + } + + /** + * @param output the output to set + */ + public void setOutput(String output) { + this.output = output; + } + + /** + * @return the makezscoretable + */ + public boolean isMakezscoretable() { + return makezscoretable; + } + + /** + * @param makezscoretable the makezscoretable to set + */ + public void setMakezscoretable(boolean makezscoretable) { + this.makezscoretable = makezscoretable; + } + + /** + * @return the probeDatasetPresenceThreshold + */ + public int getProbeDatasetPresenceThreshold() { + return probeDatasetPresenceThreshold; + } + + /** + * @param probeDatasetPresenceThreshold the probeDatasetPresenceThreshold to + * set + */ + public void setProbeDatasetPresenceThreshold(int probeDatasetPresenceThreshold) { + this.probeDatasetPresenceThreshold = probeDatasetPresenceThreshold; + } + + /** + * @return the snpDatasetPresenceThreshold + */ + public int getSnpDatasetPresenceThreshold() { + return snpDatasetPresenceThreshold; + } + + /** + * @param snpDatasetPresenceThreshold the snpDatasetPresenceThreshold to set + */ + public void setSnpDatasetPresenceThreshold(int snpDatasetPresenceThreshold) { + this.snpDatasetPresenceThreshold = snpDatasetPresenceThreshold; + } + + /** + * @return the probeAndSNPPresenceFilterSampleThreshold + */ + public int getProbeAndSNPPresenceFilterSampleThreshold() { + return probeAndSNPPresenceFilterSampleThreshold; + } + + /** + * @param probeAndSNPPresenceFilterSampleThreshold the + * probeAndSNPPresenceFilterSampleThreshold to set + */ + public void setProbeAndSNPPresenceFilterSampleThreshold(int probeAndSNPPresenceFilterSampleThreshold) { + 
this.probeAndSNPPresenceFilterSampleThreshold = probeAndSNPPresenceFilterSampleThreshold; + } + + /** + * @return the runonlypermutation + */ + public int getRunonlypermutation() { + return runonlypermutation; + } + + /** + * @param runonlypermutation the runonlypermutation to set + */ + public void setRunonlypermutation(int runonlypermutation) { + this.runonlypermutation = runonlypermutation; + } + + /** + * @return the nrThresds + */ + public int getNrThresds() { + return nrThresds; + } + + /** + * @param nrThresds the nrThresds to set + */ + public void setNrThresds(int nrThresds) { + this.nrThresds = nrThresds; + } + + ArrayList getDatasetPrefix() { + return datasetPrefix; + } + + /** + * @return the probeselection + */ + public String getProbeselection() { + return probeselection; + } + + /** + * @param probeselection the probeselection to set + */ + public void setProbeselection(String probeselection) { + this.probeselection = probeselection; + } + + public String getSNPSelection() { + return snpselection; + } + + public String getSNPProbeSelection() { + return snpprobeselection; + } + + void save() { + try { + config.save(output + "metasettings.xml"); + } catch (ConfigurationException ex) { + Logger.getLogger(MetaSettings.class.getName()).log(Level.SEVERE, null, ex); + } + + } } /* diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/BinaryUnzipTask.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/BinaryUnzipTask.java index 623a66e09..f2d3bd2b3 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/BinaryUnzipTask.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/BinaryUnzipTask.java @@ -4,103 +4,103 @@ */ package eqtlmappingpipeline.binarymeta.meta.cis; +import umcg.genetica.containers.Pair; +import umcg.genetica.io.trityper.bin.BinaryResultDataset; +import umcg.genetica.io.trityper.bin.BinaryResultSNP; + import 
java.nio.ByteBuffer; import java.util.HashMap; import java.util.concurrent.Callable; import java.util.zip.DataFormatException; import java.util.zip.Inflater; -import umcg.genetica.containers.Pair; -import umcg.genetica.io.trityper.bin.BinaryResultDataset; -import umcg.genetica.io.trityper.bin.BinaryResultSNP; /** - * * @author harm-jan */ public class BinaryUnzipTask implements Callable>> { - private final int snp; - private BinaryResultDataset data; - private final int numprobes; - private final Inflater inflater = new Inflater(); - private boolean poison; - - public BinaryUnzipTask(int snp, BinaryResultDataset data, int numprobes) { - this.snp = snp; - this.data = data; - this.numprobes = numprobes; - } - - BinaryUnzipTask(int snp, int nrProbes, BinaryResultDataset dataset, BinaryResultSNP[] snps) { - throw new UnsupportedOperationException("Not yet implemented"); - } - - @Override - public Pair> call() throws Exception { - if (snp < 0) { - return new Pair>(-1, null); - } - BinaryResultSNP[] snps = data.getSnps(); - BinaryResultSNP snpObject = snps[snp]; - long pointer = snpObject.getzScoreIndex(); - long nextpointer = -1; - - if (snp + 1 < snps.length) { - BinaryResultSNP snpObject2 = snps[snp + 1]; - nextpointer = snpObject2.getzScoreIndex(); - } - - byte[] bindata = data.getMatrix().readDeflated(pointer, nextpointer, data.getNumProbes()); - HashMap dataUnzipped = inflate(bindata, data.getNumProbes()); - bindata = null; + private final int snp; + private BinaryResultDataset data; + private final int numprobes; + private final Inflater inflater = new Inflater(); + private boolean poison; - return new Pair>(snp, dataUnzipped); - } + public BinaryUnzipTask(int snp, BinaryResultDataset data, int numprobes) { + this.snp = snp; + this.data = data; + this.numprobes = numprobes; + } - private HashMap inflate(byte[] buffer, int numElems) throws DataFormatException { - inflater.setInput(buffer); - inflater.finished(); - byte[] decompressed = new byte[numElems * 4]; - 
inflater.inflate(decompressed); + BinaryUnzipTask(int snp, int nrProbes, BinaryResultDataset dataset, BinaryResultSNP[] snps) { + throw new UnsupportedOperationException("Not yet implemented"); + } - long actuallydecompressed = inflater.getBytesWritten(); - if (actuallydecompressed != numElems * 4) { - throw new DataFormatException("IO Error: uncompressed data does not correspond to the size requested\t" + actuallydecompressed + "\t" + numElems * 4); + @Override + public Pair> call() throws Exception { + if (snp < 0) { + return new Pair>(-1, null); + } + BinaryResultSNP[] snps = data.getSnps(); + BinaryResultSNP snpObject = snps[snp]; + long pointer = snpObject.getzScoreIndex(); + long nextpointer = -1; + + if (snp + 1 < snps.length) { + BinaryResultSNP snpObject2 = snps[snp + 1]; + nextpointer = snpObject2.getzScoreIndex(); + } + + byte[] bindata = data.getMatrix().readDeflated(pointer, nextpointer, data.getNumProbes()); + HashMap dataUnzipped = inflate(bindata, data.getNumProbes()); + bindata = null; + + return new Pair>(snp, dataUnzipped); } - inflater.reset(); - - ByteBuffer bytebuffer = ByteBuffer.wrap(decompressed); - Float[] output = new Float[numElems]; - int ctr = 0; - HashMap results = new HashMap(); - for (int i = 0; i < numElems; i++) { - Float f = bytebuffer.getFloat(); - if (f.isNaN()) { - f = null; - } else { - ctr++; - results.put(i, f); - } + private HashMap inflate(byte[] buffer, int numElems) throws DataFormatException { + inflater.setInput(buffer); + inflater.finished(); + byte[] decompressed = new byte[numElems * 4]; + inflater.inflate(decompressed); + + long actuallydecompressed = inflater.getBytesWritten(); + if (actuallydecompressed != numElems * 4) { + throw new DataFormatException("IO Error: uncompressed data does not correspond to the size requested\t" + actuallydecompressed + "\t" + numElems * 4); + } + + inflater.reset(); + + ByteBuffer bytebuffer = ByteBuffer.wrap(decompressed); + Float[] output = new Float[numElems]; + int ctr = 0; + 
HashMap results = new HashMap(); + for (int i = 0; i < numElems; i++) { + Float f = bytebuffer.getFloat(); + if (f.isNaN()) { + f = null; + } else { + ctr++; + results.put(i, f); + } // output[i] = f; - } + } - decompressed = null; - buffer = null; + decompressed = null; + buffer = null; - if (ctr == 0) { - return null; - } else { - return results; + if (ctr == 0) { + return null; + } else { + return results; + } } - } - void setIsPoison() { - poison = true; - } + void setIsPoison() { + poison = true; + } - boolean isPoison() { - return poison; - } + boolean isPoison() { + return poison; + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/CisAnalysis.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/CisAnalysis.java index 538175b4e..c96e96d28 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/CisAnalysis.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/cis/CisAnalysis.java @@ -4,7 +4,7 @@ */ package eqtlmappingpipeline.binarymeta.meta.cis; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import eqtlmappingpipeline.metaqtl3.FDR; import eqtlmappingpipeline.metaqtl3.graphics.EQTLDotPlot; import eqtlmappingpipeline.binarymeta.meta.MetaAnalyze; diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/graphics/ZScorePlot.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/graphics/ZScorePlot.java index 3b1ac4b5f..140b91105 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/graphics/ZScorePlot.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/meta/graphics/ZScorePlot.java @@ -4,8 +4,8 @@ */ package eqtlmappingpipeline.binarymeta.meta.graphics; -import com.lowagie.text.Document; -import com.lowagie.text.pdf.PdfContentByte; +import com.itextpdf.text.Document; +import 
com.itextpdf.text.pdf.PdfContentByte; import java.awt.Color; import java.awt.Font; import java.awt.Graphics2D; @@ -37,7 +37,7 @@ public class ZScorePlot { private String outfilename = ""; private Document document; private PdfContentByte cb; - private com.lowagie.text.pdf.PdfWriter writer; + private com.itextpdf.text.pdf.PdfWriter writer; public void init(int numdatasets, String[] datasets, boolean pdf, String filename) { @@ -58,11 +58,11 @@ public void init(int numdatasets, String[] datasets, boolean pdf, String filenam height = (plotsize * numDatasets) + ((numdatasets + 1) * spacer) - (plotsize + spacer); if (pdfOutput) { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); + document = new com.itextpdf.text.Document(rectangle); writer = null; try { - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); document.open(); cb = writer.getDirectContent(); cb.saveState(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/util/Filter.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/util/Filter.java index ed2f3f9bb..628fa188f 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/util/Filter.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/binarymeta/util/Filter.java @@ -4,7 +4,7 @@ */ package eqtlmappingpipeline.binarymeta.util; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import eqtlmappingpipeline.metaqtl3.FDR; import eqtlmappingpipeline.metaqtl3.FDR.FDRMethod; import eqtlmappingpipeline.metaqtl3.graphics.EQTLDotPlot; diff --git 
a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/graphics/Graphics.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/graphics/Graphics.java index 1f21e3200..0cfc0972d 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/graphics/Graphics.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/graphics/Graphics.java @@ -18,7 +18,7 @@ */ public class Graphics { - private com.lowagie.text.Document document; + private com.itextpdf.text.Document document; private boolean usePDF = false; protected BufferedImage bi; protected Graphics2D g2d; @@ -29,9 +29,9 @@ public class Graphics { protected int marginTop, marginBottom, marginLeft, marginRight; protected double scalingX, scalingY; protected int FILE_TYPE; - protected com.lowagie.text.pdf.PdfContentByte cb; + protected com.itextpdf.text.pdf.PdfContentByte cb; protected String outputLoc = ""; - protected com.lowagie.text.pdf.PdfWriter writer; + protected com.itextpdf.text.pdf.PdfWriter writer; public Graphics() { bi = new java.awt.image.BufferedImage(100, 100, java.awt.image.BufferedImage.TYPE_INT_RGB); @@ -51,11 +51,11 @@ public Graphics(int width, int height) { protected void init(int width, int height) { if (usePDF) { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); + document = new com.itextpdf.text.Document(rectangle); writer = null; try { - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outputLoc)); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outputLoc)); document.open(); cb = writer.getDirectContent(); cb.saveState(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/gui/EQTLMappingPipelineConsole.java 
b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/gui/EQTLMappingPipelineConsole.java index 48494cd1f..ba6e06f2b 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/gui/EQTLMappingPipelineConsole.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/gui/EQTLMappingPipelineConsole.java @@ -11,6 +11,7 @@ import eqtlmappingpipeline.binaryInteraction.BinaryInteractionMetaAnalysis; import eqtlmappingpipeline.binaryInteraction.ConvertTextOutputToBinary; import eqtlmappingpipeline.binaryInteraction.CovariateImportance; +import eqtlmappingpipeline.binaryInteraction.InvestigateCovariate; import eqtlmappingpipeline.binaryInteraction.QueryBinaryInteraction; import eqtlmappingpipeline.binaryInteraction.ReplicateInteractions; import eqtlmappingpipeline.interactionanalysis.InteractionAnalysisConsoleGUI; @@ -131,6 +132,9 @@ public void main(String[] args) throws Exception { } else if (mode.equals("interactionChi2") || mode.equals("ic")) { CovariateImportance.main(Arrays.copyOfRange(args, 2, args.length)); return; + } else if (mode.equals("covariate")) { + InvestigateCovariate.main(Arrays.copyOfRange(args, 2, args.length)); + return; } else if (mode.equals("pileupToVcf")) { PileupToVcf.main(Arrays.copyOfRange(args, 2, args.length)); return; diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisConsoleGUI.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisConsoleGUI.java index 772e758d3..c742c3153 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisConsoleGUI.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisConsoleGUI.java @@ -4,253 +4,264 @@ */ package eqtlmappingpipeline.interactionanalysis; -import java.io.IOException; import umcg.genetica.console.ConsoleGUIElems; +import java.io.IOException; + /** - * * @author harm-jan */ 
public class InteractionAnalysisConsoleGUI { - enum RUNMODE { + enum RUNMODE { - NORMALIZE, CELLTYPESPECIFICEQTLMAPPING, PLOT - }; + NORMALIZE, CELLTYPESPECIFICEQTLMAPPING, PLOT + } - /** - * @param args the command line arguments - */ - public InteractionAnalysisConsoleGUI(String[] args) { - String inexpraw = null; - String out = null; - String celltypespecificprobefile = null; - String mdscomponents = null; - String cellcountfile = null; - String in = null; - String gte = null; - String snpprobecombofile = null; - String covariates = null; - String inexp = null; + ; + + /** + * @param args the command line arguments + */ + public InteractionAnalysisConsoleGUI(String[] args) { + String inexpraw = null; + String out = null; + String celltypespecificprobefile = null; + String mdscomponents = null; + String cellcountfile = null; + String in = null; + String gte = null; + String snpprobecombofile = null; + String covariates = null; + String inexp = null; String cohort = null; - RUNMODE step = null; - boolean binaryoutput = false; + RUNMODE step = null; + boolean binaryoutput = false; - boolean robust = false; - boolean fullStats = false; + boolean robust = false; + boolean forceNormal = false; + boolean fullStats = false; - boolean matchCovariateNamesToExpressionProbeNames = false; - Integer nrThreads = null; - String covariateList = null; + boolean matchCovariateNamesToExpressionProbeNames = false; + Integer nrThreads = null; + String covariateList = null; - for (int i = 0; i < args.length; i++) { - String arg = args[i]; - String val = null; + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + String val = null; - if (i + 1 < args.length) { - val = args[i + 1]; - } + if (i + 1 < args.length) { + val = args[i + 1]; + } - if (arg.equals("--step")) { - if (val == null) { + if (arg.equals("--step")) { + if (val == null) { - } else if (val.equals("normalize")) { - step = RUNMODE.NORMALIZE; - } else if (val.equals("mapeqtls")) { - step = 
RUNMODE.CELLTYPESPECIFICEQTLMAPPING; - } else if (val.equals("plot")) { - step = RUNMODE.PLOT; - } - } else if (arg.equals("--inexpraw")) { - inexpraw = val; - } else if (arg.equals("--covariatelist")) { - covariateList = val; - } else if (arg.equals("--binary")) { - binaryoutput = true; - } else if (arg.equals("--robust")) { - System.out.println("WARNING: using R connection!! Make sure Rserve and sandwich are installed"); - robust = true; - } else if (arg.equals("--fullstats")) { - fullStats = true; - } else if (arg.equals("--covariates")) { - covariates = val; - } else if (arg.equals("--inexp")) { - inexp = val; - } else if (arg.equals("--out")) { - out = val; - } else if (arg.equals("--in")) { - in = val; - } else if (arg.equals("--celltypespecificprobes")) { - celltypespecificprobefile = val; - } else if (arg.equals("--mdscomponents")) { - mdscomponents = val; - } else if (arg.equals("--cellcounts")) { - cellcountfile = val; - } else if (arg.equals("--gte")) { - gte = val; - } else if (arg.equals("--snpprobe")) { - snpprobecombofile = val; + } else if (val.equals("normalize")) { + step = RUNMODE.NORMALIZE; + } else if (val.equals("mapeqtls")) { + step = RUNMODE.CELLTYPESPECIFICEQTLMAPPING; + } else if (val.equals("plot")) { + step = RUNMODE.PLOT; + } + } else if (arg.equals("--inexpraw")) { + inexpraw = val; + } else if (arg.equals("--covariatelist")) { + covariateList = val; + } else if (arg.equals("--binary")) { + binaryoutput = true; + } else if (arg.equals("--robust")) { + System.out.println("WARNING: using R connection!! 
Make sure Rserve and sandwich are installed"); + robust = true; + } else if (arg.equals("--forceNormal")) { + forceNormal = true; + } else if (arg.equals("--fullstats")) { + fullStats = true; + } else if (arg.equals("--covariates")) { + covariates = val; + } else if (arg.equals("--inexp")) { + inexp = val; + } else if (arg.equals("--out")) { + out = val; + } else if (arg.equals("--in")) { + in = val; + } else if (arg.equals("--celltypespecificprobes")) { + celltypespecificprobefile = val; + } else if (arg.equals("--mdscomponents")) { + mdscomponents = val; + } else if (arg.equals("--cellcounts")) { + cellcountfile = val; + } else if (arg.equals("--gte")) { + gte = val; + } else if (arg.equals("--snpprobe")) { + snpprobecombofile = val; } else if (arg.equals("--cohort")) { cohort = val; - } else if (arg.equals("--testMatchingCovariates")) { - matchCovariateNamesToExpressionProbeNames = true; - } else if (arg.equals("--threads")) { - try { - nrThreads = Integer.parseInt(val); - } catch (NumberFormatException e) { - System.err.println("ERROR: value supplied for --threads is not a numerical value."); - System.exit(-1); - } - if (nrThreads != null && nrThreads < 1) { - System.err.println("ERROR: value supplied for --threads is smaller than 1."); - System.exit(-1); - } + } else if (arg.equals("--testMatchingCovariates")) { + matchCovariateNamesToExpressionProbeNames = true; + } else if (arg.equals("--threads")) { + try { + nrThreads = Integer.parseInt(val); + } catch (NumberFormatException e) { + System.err.println("ERROR: value supplied for --threads is not a numerical value."); + System.exit(-1); + } + if (nrThreads != null && nrThreads < 1) { + System.err.println("ERROR: value supplied for --threads is smaller than 1."); + System.exit(-1); + } - } - } + } + } - if (step == null) { - System.err.println("ERROR: please select the step to run."); - printUsage(); - } + if (step == null) { + System.err.println("ERROR: please select the step to run."); + printUsage(); + } - 
try { - if (step == RUNMODE.PLOT) { - System.out.println("Interaction plotter"); - boolean kill = false; - if (covariates == null) { - System.err.println("Error: please supply --covariates"); - kill = true; - } - if (in == null) { - System.err.println("Error: please supply --in"); - kill = true; - } - if (inexp == null) { - System.err.println("Error: please supply --inexp"); - kill = true; - } - if (out == null) { - System.err.println("Error: please supply --out"); - kill = true; - } - if (kill) { - System.err.println(""); - printUsage(); - } else { - InteractionPlotter plotter = new InteractionPlotter(snpprobecombofile, in, inexp, covariates, gte, out); - } - } else { - InteractionAnalysisMultiThreaded qmt = new InteractionAnalysisMultiThreaded(); - if (step == RUNMODE.NORMALIZE) { - System.out.println("Cell type specific cis-eQTL normalization"); - boolean kill = false; - if (inexpraw == null) { - System.err.println("Error: please supply --inexpraw"); - kill = true; - } - if (out == null) { - System.err.println("Error: please supply --out"); - kill = true; - } - if (celltypespecificprobefile == null) { - System.err.println("Error: please supply --celltypespecificprobes"); - kill = true; - } - if (kill) { - System.err.println(""); - printUsage(); - } else { - qmt.prepareDataForCelltypeSpecificEQTLMapping(inexpraw, out, null, celltypespecificprobefile, mdscomponents, cellcountfile, gte, nrThreads); - } - } else if (step == RUNMODE.CELLTYPESPECIFICEQTLMAPPING) { - System.out.println("Cell type specific cis-eQTL mapping"); - boolean kill = false; - if (covariates == null) { - System.err.println("Error: please supply --covariates"); - kill = true; - } - if (inexp == null) { - System.err.println("Error: please supply --inexp"); - kill = true; - } - if (out == null) { - System.err.println("Error: please supply --out"); - kill = true; - } - if (cellcountfile == null) { + try { + if (step == RUNMODE.PLOT) { + System.out.println("Interaction plotter"); + boolean kill = 
false; + if (covariates == null) { + System.err.println("Error: please supply --covariates"); + kill = true; + } + if (in == null) { + System.err.println("Error: please supply --in"); + kill = true; + } + if (inexp == null) { + System.err.println("Error: please supply --inexp"); + kill = true; + } + if (out == null) { + System.err.println("Error: please supply --out"); + kill = true; + } + if (kill) { + System.err.println(""); + printUsage(); + } else { + InteractionPlotter plotter = new InteractionPlotter(snpprobecombofile, in, inexp, covariates, gte, out); + } + } else { + InteractionAnalysisMultiThreaded qmt = new InteractionAnalysisMultiThreaded(); + if (step == RUNMODE.NORMALIZE) { + System.out.println("Cell type specific cis-eQTL normalization"); + boolean kill = false; + if (inexpraw == null) { + System.err.println("Error: please supply --inexpraw"); + kill = true; + } + if (out == null) { + System.err.println("Error: please supply --out"); + kill = true; + } + if (celltypespecificprobefile == null) { + System.err.println("Error: please supply --celltypespecificprobes"); + kill = true; + } + if (kill) { + System.err.println(""); + printUsage(); + } else { + qmt.prepareDataForCelltypeSpecificEQTLMapping(inexpraw, out, null, celltypespecificprobefile, mdscomponents, cellcountfile, gte, nrThreads); + } + } else if (step == RUNMODE.CELLTYPESPECIFICEQTLMAPPING) { + System.out.println("Cell type specific cis-eQTL mapping"); + boolean kill = false; + if (covariates == null) { + System.err.println("Error: please supply --covariates"); + kill = true; + } + if (inexp == null) { + System.err.println("Error: please supply --inexp"); + kill = true; + } + if (out == null) { + System.err.println("Error: please supply --out"); + kill = true; + } + if (cellcountfile == null) { // System.err.println("Warning: yo please supply --cellcounts"); - //kill = true; - } + //kill = true; + } if ((binaryoutput == true) && (cohort == null)) { System.err.println("Error: please supply 
--cohort (required in binary output mode)"); kill = true; } - if (kill) { - System.err.println(""); - printUsage(); - } else { - qmt.runInteractionAnalysis(inexp, - covariates, - in, - gte, - snpprobecombofile, - nrThreads, - out, - covariateList, robust, fullStats, binaryoutput, cohort); -// qmt.runCelltypeSpecificEQTLMapping(inexppccorrected, inexpraw, in, gte, snpprobecombofile, cellcountfile, nrThreads, out, testAllCovariatesInCovariateData); - } + if (kill) { + System.err.println(""); + printUsage(); + } else { + qmt.runInteractionAnalysis(inexp, + covariates, + in, + gte, + snpprobecombofile, + nrThreads, + out, + covariateList, + forceNormal, + robust, + fullStats, + binaryoutput, + cohort); + + } - } - } - } catch (IOException e) { - e.printStackTrace(); - } catch (Exception e) { - e.printStackTrace(); - } - } + } + } + } catch (IOException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + } - private void printUsage() { - System.out.print("\nCell type specific eQTL Mapping\n" + ConsoleGUIElems.LINE); - System.out.println("This program uses an OLS model to test eQTLs for cell type specificity."); + private void printUsage() { + System.out.print("\nCell type specific eQTL Mapping\n" + ConsoleGUIElems.LINE); + System.out.println("This program uses an OLS model to test eQTLs for cell type specificity."); - System.out.println(""); - System.out.print("Step 1: Normalization\n" + ConsoleGUIElems.LINE); - System.out.println("--step normalize\t\t\t\tTell the program to run normalization.\n" - + "--inexpraw\t\t\tdir\t\tLocation of the gene expression data\n" - + "--out\t\t\t\tdir\t\tLocation where the output should be stored\n" - + "--celltypespecificprobes\tString\t\tLocation of the file containing list of cell-type specific probes\n" - + "--mdscomponents\t\t\tString\t\tLocation of the file containing MDS components (optional)\n" - + "--gte\t\t\t\tString\t\tLocation of the genotype to expression coupling file (optional)\n" - + 
"--cellcounts\t\t\tString\t\tLocation of the cell count file (optional)\n"); + System.out.println(""); + System.out.print("Step 1: Normalization\n" + ConsoleGUIElems.LINE); + System.out.println("--step normalize\t\t\t\tTell the program to run normalization.\n" + + "--inexpraw\t\t\tdir\t\tLocation of the gene expression data\n" + + "--out\t\t\t\tdir\t\tLocation where the output should be stored\n" + + "--celltypespecificprobes\tString\t\tLocation of the file containing list of cell-type specific probes\n" + + "--mdscomponents\t\t\tString\t\tLocation of the file containing MDS components (optional)\n" + + "--gte\t\t\t\tString\t\tLocation of the genotype to expression coupling file (optional)\n" + + "--cellcounts\t\t\tString\t\tLocation of the cell count file (optional)\n"); - System.out.println(""); + System.out.println(""); - System.out.print("Step 2: Mapping eQTLs with interaction model\n" + ConsoleGUIElems.LINE); - System.out.println("--step mapeqtls\t\t\t\tTell the program to map eQTLs.\n" - + "--inexp\tdir\t\tLocation of the dependent dataset\n" - + "--covariates\t\tdir\t\tLocation of covariate file (may contain one or more covariates)\n" - + "--gte\t\t\tString\t\tLocation of the genotype to expression coupling file\n" - + "--in\t\t\tdir\t\tLocation of the genotype data\n" - + "--out\t\t\tdir\t\tLocation where the output should be stored\n" - + "--snpprobe\t\tString\t\tLocation of the SNP-Probe combination file\n" - + "--threads\t\tInteger\t\tThe number of threads to use for calculations.\n" - + "--covariatelist\t\tList of covariates to test\n" - + "--robust\t\tUse robust estimates of standard errors (Requires Rserve and sandwich packages, and R)\n" - + "--fullstats\t\tOutput extra columns of statistics (SEs and Betas)"); + System.out.print("Step 2: Mapping eQTLs with interaction model\n" + ConsoleGUIElems.LINE); + System.out.println("--step mapeqtls\t\t\t\tTell the program to map eQTLs.\n" + + "--inexp\tdir\t\tLocation of the dependent dataset\n" + + 
"--covariates\t\tdir\t\tLocation of covariate file (may contain one or more covariates)\n" + + "--gte\t\t\tString\t\tLocation of the genotype to expression coupling file\n" + + "--in\t\t\tdir\t\tLocation of the genotype data\n" + + "--out\t\t\tdir\t\tLocation where the output should be stored\n" + + "--snpprobe\t\tString\t\tLocation of the SNP-Probe combination file\n" + + "--threads\t\tInteger\t\tThe number of threads to use for calculations.\n" + + "--covariatelist\t\tList of covariates to test\n" + + "--robust\t\tUse robust estimates of standard errors (Requires Rserve and sandwich packages, and R)\n" + + "--forceNormal\t\tForce a normal distribution on the covariate and gene expression data.\n" + + "--fullstats\t\tOutput extra columns of statistics (SEs and Betas)"); - System.out.println(""); + System.out.println(""); - System.out.print("Step 3: Plot effects\n" + ConsoleGUIElems.LINE); - System.out.println("--step plot\t\t\t\tTell the program to plot interaction effects.\n" - + "--inexp\tdir\t\tLocation of the dependent dataset\n" - + "--covariates\t\tdir\t\tLocation of covariate file (the raw gene expression data or the matrix containing the covariates to analyze)\n" - + "--gte\t\t\tString\t\tLocation of the genotype to expression coupling file\n" - + "--in\t\t\tdir\t\tLocation of the genotype data\n" - + "--out\t\t\tdir\t\tLocation where the output should be stored\n" - + "--snpprobe\t\tString\t\tLocation of the SNP-Covariate-Probe combination file\n" - ); + System.out.print("Step 3: Plot effects\n" + ConsoleGUIElems.LINE); + System.out.println("--step plot\t\t\t\tTell the program to plot interaction effects.\n" + + "--inexp\tdir\t\tLocation of the dependent dataset\n" + + "--covariates\t\tdir\t\tLocation of covariate file (the raw gene expression data or the matrix containing the covariates to analyze)\n" + + "--gte\t\t\tString\t\tLocation of the genotype to expression coupling file\n" + + "--in\t\t\tdir\t\tLocation of the genotype data\n" + + 
"--out\t\t\tdir\t\tLocation where the output should be stored\n" + + "--snpprobe\t\tString\t\tLocation of the SNP-Covariate-Probe combination file\n" + ); - } + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisDetermineDirection.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisDetermineDirection.java new file mode 100644 index 000000000..3b4f520c6 --- /dev/null +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisDetermineDirection.java @@ -0,0 +1,420 @@ +package eqtlmappingpipeline.interactionanalysis; + +import au.com.bytecode.opencsv.CSVReader; +import au.com.bytecode.opencsv.CSVWriter; +import gnu.trove.list.array.TDoubleArrayList; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.Map; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.collections4.BidiMap; +import org.apache.commons.collections4.bidimap.DualHashBidiMap; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.math3.random.Well19937c; +import org.apache.commons.math3.stat.correlation.SpearmansCorrelation; +import org.apache.commons.math3.stat.ranking.NaNStrategy; +import org.apache.commons.math3.stat.ranking.NaturalRanking; +import org.apache.commons.math3.stat.ranking.RankingAlgorithm; +import org.apache.log4j.Logger; +import 
org.molgenis.genotype.Allele; +import org.molgenis.genotype.Alleles; +import org.molgenis.genotype.GenotypeDataException; +import org.molgenis.genotype.GenotypeInfo; +import org.molgenis.genotype.RandomAccessGenotypeData; +import org.molgenis.genotype.RandomAccessGenotypeDataReaderFormats; +import org.molgenis.genotype.multipart.IncompatibleMultiPartGenotypeDataException; +import org.molgenis.genotype.tabix.TabixFileNotFoundException; +import org.molgenis.genotype.variant.GeneticVariant; +import umcg.genetica.math.matrix2.DoubleMatrixDataset; + +/** + * + * @author Patrick Deelen + */ +public class InteractionAnalysisDetermineDirection { + + private final RandomAccessGenotypeData genotypeData; + private final DoubleMatrixDataset expressionData; + private final DoubleMatrixDataset covariatesData; + private final BidiMap gte; + private final HashMap variantIdMap; + private static final RankingAlgorithm COV_RANKER = new NaturalRanking(NaNStrategy.FAILED, new Well19937c(1)); + private static final SpearmansCorrelation spearmanCalculator = new SpearmansCorrelation(); + private static final Options OPTIONS; + private static Logger LOGGER; + + static { + + LOGGER = Logger.getLogger(GenotypeInfo.class); + + OPTIONS = new Options(); + + OptionBuilder.withArgName("basePath"); + OptionBuilder.hasArgs(); + OptionBuilder.withDescription("The genotype"); + OptionBuilder.withLongOpt("genotypes"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("g")); + + OptionBuilder.withArgName("format"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("The genotype data format. 
If not defined will attempt to automatically select the first matching dataset on the specified path\n" + + "* PED_MAP - plink PED MAP files.\n" + + "* PLINK_BED - plink BED BIM FAM files.\n" + + "* VCF - bgziped vcf with tabix index file\n" + + "* VCFFOLDER - matches all bgziped vcf files + tabix index in a folder\n" + + "* SHAPEIT2 - shapeit2 phased haplotypes .haps & .sample\n" + + "* GEN - Oxford .gen & .sample\n" + + "* TRITYPER - TriTyper format folder"); + OptionBuilder.withLongOpt("genotypesFormat"); + OPTIONS.addOption(OptionBuilder.create("G")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Expression data"); + OptionBuilder.withLongOpt("expression"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("e")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Covariate data"); + OptionBuilder.withLongOpt("covariates"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("c")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Genotype to expression coupling"); + OptionBuilder.withLongOpt("gte"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("gte")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Query variant gene covariate assessedAllele. 
No header"); + OptionBuilder.withLongOpt("query"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("q")); + + OptionBuilder.withArgName("path"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Output file"); + OptionBuilder.withLongOpt("output"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("o")); + + OptionBuilder.withArgName("double"); + OptionBuilder.hasArg(); + OptionBuilder.withDescription("Fraction of tail of either end of covarate to use."); + OptionBuilder.withLongOpt("fraction"); + OptionBuilder.isRequired(); + OPTIONS.addOption(OptionBuilder.create("f")); + + } + + public static void main(String[] args) throws IOException { + + CommandLineParser parser = new PosixParser(); + final CommandLine commandLine; + try { + commandLine = parser.parse(OPTIONS, args, false); + } catch (ParseException ex) { + System.err.println("Invalid command line arguments: " + ex.getMessage()); + System.err.println(); + new HelpFormatter().printHelp(" ", OPTIONS); + System.exit(1); + return; + } + + final String[] genotypePath = commandLine.getOptionValues("g"); + final RandomAccessGenotypeDataReaderFormats genotypeFormat; + + try { + if (commandLine.hasOption("G")) { + genotypeFormat = RandomAccessGenotypeDataReaderFormats.valueOf(commandLine.getOptionValue("G").toUpperCase()); + } else { + if (genotypePath[0].endsWith(".vcf")) { + System.err.println("Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file."); + System.exit(1); + return; + } + try { + genotypeFormat = RandomAccessGenotypeDataReaderFormats.matchFormatToPath(genotypePath[0]); + } catch (GenotypeDataException e) { + System.err.println("Unable to determine input 1 type based on specified path. 
Please specify --G"); + System.exit(1); + return; + } + } + } catch (IllegalArgumentException e) { + System.err.println("Error parsing --G \"" + commandLine.getOptionValue("G") + "\" is not a valid input data format"); + System.exit(1); + return; + } + + final String expressionDataPath = commandLine.getOptionValue("e"); + final String covariateDataPath = commandLine.getOptionValue("c"); + final String gtePath = commandLine.getOptionValue("gte"); + final String queryPath = commandLine.getOptionValue("q"); + final String outputPath = commandLine.getOptionValue("o"); + final double fractionToUse = Double.parseDouble(commandLine.getOptionValue("f")); + + System.out.println("Genotype data: " + genotypePath); + System.out.println("Genotype data format: " + genotypeFormat); + System.out.println("Expression data: " + expressionDataPath); + System.out.println("Covariate data: " + covariateDataPath); + System.out.println("Gte data: " + gtePath); + System.out.println("Query: " + queryPath); + System.out.println("Output: " + outputPath); + System.out.println("Outer fractions to use: " + fractionToUse); + + final RandomAccessGenotypeData genotypeData; + + try { + genotypeData = genotypeFormat.createFilteredGenotypeData(genotypePath, 100, null, null, null, 0.8); + } catch (TabixFileNotFoundException e) { + LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n" + + "Please see README on how to create a tabix file"); + System.exit(1); + return; + } catch (IOException e) { + LOGGER.fatal("Error reading input data: " + e.getMessage(), e); + System.exit(1); + return; + } catch (IncompatibleMultiPartGenotypeDataException e) { + LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e); + System.exit(1); + return; + } catch (GenotypeDataException e) { + LOGGER.fatal("Error reading input data: " + e.getMessage(), e); + System.exit(1); + return; + } + + System.out.println("Genotype data loaded for " + genotypeData.getSampleNames().length + 
" individuals"); + + final DoubleMatrixDataset expressionData = DoubleMatrixDataset.loadDoubleTextData(expressionDataPath, "\t"); + + System.out.println("Loaded expression data for: " + expressionData.rows() + " genes and " + expressionData.columns() + " individuals"); + + final DoubleMatrixDataset covariatesData = DoubleMatrixDataset.loadDoubleTextData(covariateDataPath, "\t"); + + System.out.println("Loaded covariate data for: " + expressionData.rows() + " genes and " + expressionData.columns() + " individuals"); + + final BidiMap gte = loadGte(gtePath); + + InteractionAnalysisDetermineDirection directionTool = new InteractionAnalysisDetermineDirection(genotypeData, expressionData, covariatesData, gte); + + CSVReader reader = new CSVReader(new FileReader(queryPath), '\t', '\0', 1); + CSVWriter writer = new CSVWriter(new FileWriter(outputPath), '\t', CSVWriter.NO_QUOTE_CHARACTER); + + String[] outputLine = new String[6]; + int c = 0; + outputLine[c++] = "variant"; + outputLine[c++] = "gene"; + outputLine[c++] = "covariate"; + outputLine[c++] = "assessedAllele"; + outputLine[c++] = "rhoLow"; + outputLine[c++] = "rhoHigh"; + writer.writeNext(outputLine); + + String[] nextLine; + while ((nextLine = reader.readNext()) != null) { + + final String variant = nextLine[0]; + final String gene = nextLine[1]; + final String covariate = nextLine[2]; + final Allele assessedAllele = Allele.create(nextLine[3]); + + final EffectDiffResult effectDiff = directionTool.calculateEffectDifference(variant, gene, covariate, assessedAllele, fractionToUse); + + c = 0; + outputLine[c++] = variant; + outputLine[c++] = gene; + outputLine[c++] = covariate; + outputLine[c++] = assessedAllele.getAlleleAsString(); + outputLine[c++] = String.valueOf(effectDiff.getRhoLow()); + outputLine[c++] = String.valueOf(effectDiff.getRhoHigh()); + writer.writeNext(outputLine); + + } + writer.close(); + reader.close(); + System.out.println("Done"); + + } + + private static BidiMap loadGte(String gtePath) 
throws IOException { + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(gtePath), "UTF-8")); + + String line; + + BidiMap gte = new DualHashBidiMap(); + + while ((line = reader.readLine()) != null) { + String[] elements = StringUtils.split(line, '\t'); + if (elements.length != 2) { + throw new RuntimeException("Error in GTE file line: " + line); + } + gte.put(elements[0], elements[1]); + } + + return gte; + } + + public InteractionAnalysisDetermineDirection(RandomAccessGenotypeData genotypeData, DoubleMatrixDataset expressionData, DoubleMatrixDataset covariatesData, BidiMap gte) { + this.genotypeData = genotypeData; + this.expressionData = expressionData; + this.covariatesData = covariatesData; + this.gte = gte; + this.variantIdMap = genotypeData.getVariantIdMap(); + + HashSet genotypedSamples = new HashSet(); + Collections.addAll(genotypedSamples, genotypeData.getSampleNames()); + + for (Iterator> it = gte.entrySet().iterator(); it.hasNext();) { + Map.Entry gteEntry = it.next(); + + if (!genotypedSamples.contains(gteEntry.getKey())) { + it.remove(); + } + + if (!expressionData.containsCol(gteEntry.getValue())) { + it.remove(); + } + + if (!covariatesData.containsCol(gteEntry.getValue())) { + it.remove(); + } + + } + + System.out.println("Samples with: genotypes, expression & covariate data: " + gte.size()); + + } + + public EffectDiffResult calculateEffectDifference(String snpId, String geneName, String covariateName, Allele assessedAllele, double fractionOfSamplesPerGroup) { + + if (!variantIdMap.containsKey(snpId)) { + return new EffectDiffResult(Double.NaN, Double.NaN); + } + + if (!expressionData.containsRow(geneName)) { + return new EffectDiffResult(Double.NaN, Double.NaN); + } + + if (!covariatesData.containsRow(covariateName)) { + return new EffectDiffResult(Double.NaN, Double.NaN); + } + + if (fractionOfSamplesPerGroup <= 0 || fractionOfSamplesPerGroup >= 1) { + throw new RuntimeException("Fraction must be between 0 
and 1"); + } + + GeneticVariant variant = variantIdMap.get(snpId); + Alleles variantAlleles = variant.getVariantAlleles(); + + if (!variantAlleles.contains(assessedAllele)) { + return new EffectDiffResult(Double.NaN, Double.NaN); + } + + if (variantAlleles.getAlleleCount() != 2) { + return new EffectDiffResult(Double.NaN, Double.NaN); + } + + float[] dosagesAll = variant.getSampleDosages(); + String[] genotypedSamples = genotypeData.getSampleNames(); + + LinkedHashSet includedGenotypedSamples = new LinkedHashSet<>(); + TDoubleArrayList dosages = new TDoubleArrayList(dosagesAll.length); + + for (int i = 0; i < dosagesAll.length; ++i) { + if (dosagesAll[i] >= 0 && gte.containsKey(genotypedSamples[i])) { + includedGenotypedSamples.add(genotypedSamples[i]); + dosages.add(dosagesAll[i]); + } + } + + System.out.println("Included samples: " + includedGenotypedSamples.size()); + + double[] expressionLevels = new double[includedGenotypedSamples.size()]; + double[] covariateLevels = new double[includedGenotypedSamples.size()]; + + int s = 0; + for (String genotypeSample : includedGenotypedSamples) { + expressionLevels[s] = expressionData.getElement(geneName, gte.get(genotypeSample)); + covariateLevels[s] = covariatesData.getElement(covariateName, gte.get(genotypeSample)); + ++s; + } + + if (assessedAllele != variantAlleles.get(0)) { + for (int i = 0; i < dosages.size(); ++i) { + dosages.setQuick(i, dosages.getQuick(i) * -1); + } + } + + double[] covariateRanks = COV_RANKER.rank(covariateLevels); + + int samplesPerGroup = (int) Math.floor(covariateRanks.length * fractionOfSamplesPerGroup); + + System.out.println("Samples per group: " + samplesPerGroup); + + double[] dosagesLow = new double[samplesPerGroup]; + double[] expressionLow = new double[samplesPerGroup]; + + double[] dosagesHigh = new double[samplesPerGroup]; + double[] expressionHigh = new double[samplesPerGroup]; + + for (int i = 0; i < samplesPerGroup; ++i) { + dosagesLow[i] = dosages.get((int) covariateRanks[i]); 
+ expressionLow[i] = expressionLevels[(int) covariateRanks[i]]; + dosagesHigh[i] = dosages.get((int) covariateRanks[covariateRanks.length - 1 - i]); + expressionHigh[i] = expressionLevels[(int) covariateRanks[covariateRanks.length - 1 - i]]; + } + + double rhoLow = spearmanCalculator.correlation(dosagesLow, expressionLow); + double rhoHigh = spearmanCalculator.correlation(dosagesHigh, expressionHigh); + + System.out.println("rho low:" + rhoLow); + System.out.println("rho high:" + rhoHigh); + + return new EffectDiffResult(rhoLow, rhoHigh); + + } + + static class EffectDiffResult { + + private final double rhoLow; + private final double rhoHigh; + + public EffectDiffResult(double rhoLow, double rhoHigh) { + this.rhoLow = rhoLow; + this.rhoHigh = rhoHigh; + } + + public double getRhoLow() { + return rhoLow; + } + + public double getRhoHigh() { + return rhoHigh; + } + } +} diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisMultiThreaded.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisMultiThreaded.java index 824edf188..87caf1e66 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisMultiThreaded.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisMultiThreaded.java @@ -5,36 +5,21 @@ package eqtlmappingpipeline.interactionanalysis; import eqtlmappingpipeline.Main; -import org.molgenis.genotype.Allele; -import umcg.genetica.graphics.ScatterPlot; import eqtlmappingpipeline.normalization.Normalizer; import gnu.trove.map.hash.THashMap; - -import java.io.File; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.CompletionService; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import 
org.apache.commons.math3.stat.correlation.SpearmansCorrelation; -import org.rosuda.REngine.REXP; -import org.rosuda.REngine.RFactor; +import org.molgenis.genotype.Allele; import org.rosuda.REngine.Rserve.RConnection; import org.rosuda.REngine.Rserve.RserveException; import umcg.genetica.console.ProgressBar; import umcg.genetica.containers.Pair; +import umcg.genetica.graphics.ScatterPlot; import umcg.genetica.io.Gpio; import umcg.genetica.io.binInteraction.*; import umcg.genetica.io.binInteraction.gene.BinaryInteractionGeneCreator; import umcg.genetica.io.binInteraction.variant.BinaryInteractionVariantCreator; import umcg.genetica.io.text.TextFile; -import umcg.genetica.io.trityper.SNP; -import umcg.genetica.io.trityper.SNPLoader; -import umcg.genetica.io.trityper.TriTyperExpressionData; -import umcg.genetica.io.trityper.TriTyperGeneticalGenomicsDataset; -import umcg.genetica.io.trityper.TriTyperGeneticalGenomicsDatasetSettings; -import umcg.genetica.io.trityper.TriTyperGenotypeData; +import umcg.genetica.io.trityper.*; import umcg.genetica.math.matrix.DoubleMatrixDataset; import umcg.genetica.math.stats.Correlation; import umcg.genetica.math.stats.Descriptives; @@ -42,8 +27,15 @@ import umcg.genetica.math.stats.QuantileNormalization; import umcg.genetica.math.stats.concurrent.ConcurrentCorrelation; +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + /** - * * @author harm-jan Multi-threaded implementation of the OLS model */ public class InteractionAnalysisMultiThreaded { @@ -189,7 +181,7 @@ public void prepareDataForCelltypeSpecificEQTLMapping(String inexpraw, String ou // 4. 
PCA on sample correlation matrix rawExpressionDataset.transposeDataset(); // put samples back on columns // this method returns two DoubleMatrixDatasets: left are the PC scores, right are the Eigenvalues and expects the samples to be on the columns - Pair, DoubleMatrixDataset> PCAResults = n.calculatePCA(rawExpressionDataset, sampleCorrelationMatrix, expressionOutputDirectory + "PCAResults", 1); + Pair, DoubleMatrixDataset> PCAResults = n.calculatePCA(rawExpressionDataset, sampleCorrelationMatrix, expressionOutputDirectory + "PCAResults", 2); // 5. Correlate samples with PC1 - scores (QC step to determine poor RNA samples) // This dataset needs to be transposed if rows are currently PCs, and columns contain samples. @@ -297,7 +289,7 @@ public void prepareDataForCelltypeSpecificEQTLMapping(String inexpraw, String ou cellTypeSpecificDataset.transposeDataset(); // calculate first Principal Component over the cell type specific probe matrix... - PCAResults = n.calculatePCA(cellTypeSpecificDataset, celltypeSpecificCorrelationMatrix, outdirectory + "CellTypeSpecificProbePCA", 1); + PCAResults = n.calculatePCA(cellTypeSpecificDataset, celltypeSpecificCorrelationMatrix, outdirectory + "CellTypeSpecificProbePCA", cellTypeSpecificProbeDatasetRowNames.size()); // 10. PC1 scores: cell specific proxy -- write to file for future use... 
DoubleMatrixDataset cellSpecificPCScores = PCAResults.getLeft(); @@ -398,7 +390,7 @@ public void prepareDataForCelltypeSpecificEQTLMapping(String inexpraw, String ou public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile, String ingt, String gte, String snpprobecombinationfile, Integer nrThreads, String out, - String covariateList, boolean robustSE, boolean fullStats, boolean binaryOutput, String cohort) throws IOException, Exception { + String covariateList, boolean forceNormalDistribution, boolean robustSE, boolean fullStats, boolean binaryOutput, String cohort) throws IOException, Exception { String probeannot = null; double mafthreshold = 0.05; @@ -413,15 +405,7 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile System.out.println("Running tests for robust standard errors. Now testing R connection"); try { RConnection rConnection = new RConnection(); -// rConnection.voidEval("install.packages('sandwich')"); - System.out.println("R server found: "+rConnection.getServerVersion()); -// REXP result = rConnection.eval("library(sandwich,logical.return=TRUE)"); -// boolean sandwichpresent = result.asBool(); -// if(!sandwichpresent){ -// System.err.println("Library sandwich not installed, which is required for robust SE estimation."); -// } - - + System.out.println("R server found: " + rConnection.getServerVersion()); rConnection.close(); } catch (RserveException ex) { System.err.println(ex.getMessage()); @@ -482,7 +466,6 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile Set covariateHash = null; if (covariateList != null) { TextFile tfcovariatelist = new TextFile(covariateList, TextFile.R); - covariateHash = tfcovariatelist.readAsSet(0, TextFile.tab); tfcovariatelist.close(); } @@ -514,6 +497,8 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile // since the number of samples has changed, we might need to reperform q-norm and log2 transform... 
// it may be a good idea to remove these last steps from the normalization step.. + + // investigate which SNPs to run.. LinkedHashSet> snpProbeCombinationsToTest = new LinkedHashSet>(); HashSet snpsPassingQC = new HashSet(); @@ -541,7 +526,7 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile snpsPassingQC.add(snp); snpProbeCombinationsToTest.add(p); - if (binaryOutput){ + if (binaryOutput) { snpStats.put(snp, snpObj); } } else { @@ -571,36 +556,45 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile ArrayList rowNames = new ArrayList(); rowNames.addAll(covariateData.rowObjects); -// if (cellcounts != null) { -// rowNames.add("CellTypeSNPZScore"); -// rowNames.add("CellTypeZScore"); -// rowNames.add("CellTypeInteractionZScore"); -// rowNames.add("MainEffectZScore"); -// } + Correlation.correlationToZScore(covariateData.nrCols); -// DoubleMatrixDataset datasetOut = new DoubleMatrixDataset(rowNames.size(), snpProbeCombinationsToTest.size()); System.out.println("Output matrix will be " + snpProbeCombinationsToTest.size() + "(x5) x " + rowNames.size()); -// datasetOut.rowObjects = rowNames; -// ArrayList colNames = new ArrayList(); double[][] expressiondata = pcCorrectedExpressionData.getMatrix(); int[] wgaId = ds.getExpressionToGenotypeIdArray(); + if (forceNormalDistribution) { + System.out.println("Forcing normal distribution on covariate and expression data"); + System.out.println("Warning: normal distribution is forced before covariate samples are matched to genotypes."); + System.out.println("Make sure that the number of samples between samples and covariates are more or less equal"); + System.out.println("Currently: " + pcCorrectedExpressionData.getColNames().length + " for expression and " + covariateData.nrCols + " for covariates"); + + Normalizer norm = new Normalizer(); + + for (int row = 0; row < expressiondata.length; row++) { + expressiondata[row] = norm.forceNormal(expressiondata[row]); + } 
+ + double[][] covariates = covariateData.getRawData(); + for (int row = 0; row < covariates.length; row++) { + covariates[row] = norm.forceNormal(covariates[row]); + } + covariateData.setRawData(covariates); + System.out.println("Done. And you have been warned."); + } + + TextFile snpFile = new TextFile(out + "SNPSummaryStatistics.txt", TextFile.W); snpFile.writeln("SNP\tChr\tChrPos\tAlleles\tMinorAllele\tMAF\tCallRate\tHWE\tGenotypesCalled"); -// TextFile proxyEffectFile = null; -// if (cellcounts != null) { -// proxyEffectFile = new TextFile(out + "CelltypeSpecificEQTLEffects.txt", TextFile.W); -// proxyEffectFile.writeln("#/#\tSNP\tProbe\tnrCalled\tCorrelation\tanovaFTestP\tbetaInteraction\tseInteraction\ttInteraction\tpValueInteraction\tzScoreInteraction"); -// } String[] snpsPassingQCArr = snpsPassingQC.toArray(new String[0]); int nrSubmitted = 0; if (nrThreads == null) { nrThreads = Runtime.getRuntime().availableProcessors(); } + System.out.println("Running with: " + nrThreads + " threads"); ExecutorService threadPool = Executors.newFixedThreadPool(nrThreads); @@ -611,18 +605,18 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile BinaryInteractionFile binaryInteractionFile = null; // Write binary output header - if (binaryOutput){ + if (binaryOutput) { File binaryOutFile = new File(out + "InteractionResults.binary.dat"); - String description = "Genotypes: " + ingt + - " Expresion: " + inExpPCCorrected + - " GTE: " + gte + - " Covariates: " + covariateFile + - " Covariates List: " + covariateList + + String description = "Genotypes: " + ingt + + " Expresion: " + inExpPCCorrected + + " GTE: " + gte + + " Covariates: " + covariateFile + + " Covariates List: " + covariateList + " SNP-probes: " + snpprobecombinationfile + " Software version: " + Main.VERSION; - binaryInteractionFile = createBinaryOutputHeader(binaryOutFile, snpsPassingQCArr, snpStats, snpProbeCombinationsToTest, covariateData, 
expressionIndividualsInPCCorrectedData, cohort, description); - } - else{ + binaryInteractionFile = createBinaryOutputHeader(binaryOutFile, snpsPassingQCArr, snpStats, + snpProbeCombinationsToTest, covariateData, expressionIndividualsInPCCorrectedData, cohort, description); + } else { System.out.println("Output will be written to: " + out + "InteractionResults.txt"); outputFile = new TextFile(out + "InteractionResults.txt", TextFile.W); String outputheader = "SNP\tProbe\tCovariate\tZ-SNP\tZ-Cov\tZ-Interaction\tZ-Main\tZ-Interaction-Flipped\tN\tRSquared"; @@ -639,7 +633,9 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile outputFile.writeln(outputheader); } - ProgressBar pb = new ProgressBar(snpProbeCombinationsToTest.size(), "Now testing available eQTL effects for cell type specificity."); + + + ProgressBar pb = new ProgressBar(snpProbeCombinationsToTest.size(), "Now testing available eQTL effects for interactions."); int maxbuffer = (nrThreads * 8); for (int i = 0; i < snpsPassingQCArr.length; i++) { String snp = snpsPassingQCArr[i]; @@ -729,7 +725,7 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile snpFile.close(); - if (binaryOutput){ + if (binaryOutput) { binaryInteractionFile.finalizeWriting(); System.out.println("Interaction results writer buffer flushed: " + binaryInteractionFile.getInteractionWriteBufferFlushed()); System.out.println("QTL results writer buffer flushed: " + binaryInteractionFile.getQtlWriteBufferFlushed()); @@ -737,14 +733,13 @@ public void runInteractionAnalysis(String inExpPCCorrected, String covariateFile System.out.println("Total number of writen interactions: " + binaryInteractionFile.getInteractionZscoresSet()); System.out.println("Number of QTL z-scores: " + binaryInteractionFile.getVariantCount()); binaryInteractionFile.close(); - - if(binaryInteractionFile.getInteractionZscoresSet() != binaryInteractionFile.getTotalNumberInteractions()){ + + if 
(binaryInteractionFile.getInteractionZscoresSet() != binaryInteractionFile.getTotalNumberInteractions()) { System.out.println("WARNING!!! written and expected interactions not the same"); System.err.println("WARNING!!! written and expected interactions not the same"); } - - } - else{ + + } else { outputFile.close(); } // datasetOut.colObjects = colNames; @@ -798,8 +793,8 @@ private void processResult(InteractionAnalysisResults result, TextFile outputFil double mainZ = maineffectZResultMatrix[e][c]; builder.append(mainZ); - if(mainZ < 0){ - interactionZ*=-1; + if (mainZ < 0) { + interactionZ *= -1; } builder.append("\t"); builder.append(interactionZ); @@ -824,8 +819,8 @@ private void processResult(InteractionAnalysisResults result, TextFile outputFil builder.append("\t"); builder.append(interactionSE[e][c]); - if(mainZ<0){ - interactionB*=-1; + if (mainZ < 0) { + interactionB *= -1; } builder.append("\t"); builder.append(interactionB); @@ -858,7 +853,7 @@ private BinaryInteractionFile processResultWriteBinaryOutput(InteractionAnalysis //main effect z-score double mainZ = maineffectZResultMatrix[e][0]; - BinaryInteractionQtlZscores qtlZscore = new BinaryInteractionQtlZscores(new double[] {mainZ}, new int[] {numSamples}); + BinaryInteractionQtlZscores qtlZscore = new BinaryInteractionQtlZscores(new double[]{mainZ}, new int[]{numSamples}); createdInteractions.setQtlResults(snp, gene, qtlZscore); for (int c = 0; c < SNPZResultMatrix[e].length; c++) { String covariate = covariateData.rowObjects.get(c); @@ -870,8 +865,8 @@ private BinaryInteractionFile processResultWriteBinaryOutput(InteractionAnalysis final double[] zscoreCovariateCohort = {covariateZResultMatrix[e][c]}; final double[] zscoreInteractionCohort = {interactionZ}; final double[] rSquaredCohort = {rsquared[e][c]}; - if(mainZ < 0){ - interactionZ*=-1; + if (mainZ < 0) { + interactionZ *= -1; } final double[] zscoreInteractionFlippedCohort = {interactionZ}; @@ -887,7 +882,7 @@ private BinaryInteractionFile 
processResultWriteBinaryOutput(InteractionAnalysis private BinaryInteractionFile createBinaryOutputHeader(File binaryOutFile, String[] snpsPassingQCArr, HashMap snpStats, LinkedHashSet> snpProbeCombinationsToTest, DoubleMatrixDataset covariateData, HashSet expressionIndividualsInPCCorrectedData, String cohort, String description) throws BinaryInteractionFileException, IOException { LinkedHashSet geneIds = new LinkedHashSet(); System.out.println("snpProbeCombinationsToTest size: " + snpProbeCombinationsToTest.size()); - for (Pair snpProbePair : snpProbeCombinationsToTest){ + for (Pair snpProbePair : snpProbeCombinationsToTest) { String gene = snpProbePair.getRight(); geneIds.add(gene); } @@ -897,7 +892,7 @@ private BinaryInteractionFile createBinaryOutputHeader(File binaryOutFile, Strin //fill variants BinaryInteractionVariantCreator[] variants = new BinaryInteractionVariantCreator[numSNPs]; - for (int snpIdx = 0; snpIdx < numSNPs; snpIdx++ ){ + for (int snpIdx = 0; snpIdx < numSNPs; snpIdx++) { String snpId = snpsPassingQCArr[snpIdx]; SNP snpObj = snpStats.get(snpId); @@ -909,13 +904,14 @@ private BinaryInteractionFile createBinaryOutputHeader(File binaryOutFile, Strin else majorAllele = alleles[0]; - variants[snpIdx] = new BinaryInteractionVariantCreator(snpId, snpObj.getChr() + "", snpObj.getChrPos(), Allele.create((char) majorAllele), Allele.create((char)minorAllele)); + variants[snpIdx] = new BinaryInteractionVariantCreator(snpId, snpObj.getChr() + "", snpObj.getChrPos(), Allele.create((char) majorAllele), Allele.create((char) minorAllele)); + } //fill genes BinaryInteractionGeneCreator[] genes = new BinaryInteractionGeneCreator[numGenes]; int geneIdx = 0; - for (String gene : geneIds){ + for (String gene : geneIds) { genes[geneIdx] = new BinaryInteractionGeneCreator(gene); geneIdx++; } @@ -926,7 +922,7 @@ private BinaryInteractionFile createBinaryOutputHeader(File binaryOutFile, Strin //fill cohort int numSamples = 0; - for (String s : 
expressionIndividualsInPCCorrectedData){ + for (String s : expressionIndividualsInPCCorrectedData) { if (covariateData.hashCols.containsKey(s)) numSamples++; } @@ -935,10 +931,10 @@ private BinaryInteractionFile createBinaryOutputHeader(File binaryOutFile, Strin // initialize BinaryInteractionFileCreator creator = new BinaryInteractionFileCreator(binaryOutFile, variants, genes, cohorts, covariates, true, false, true, true); - + creator.setDescription(description); - for (Pair eqtl : snpProbeCombinationsToTest){ + for (Pair eqtl : snpProbeCombinationsToTest) { creator.addTestedVariantGene(eqtl.getLeft(), eqtl.getRight()); } BinaryInteractionFile createdInteractions = creator.create(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisResults.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisResults.java index 0e1abfaee..95fe9d84a 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisResults.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisResults.java @@ -5,7 +5,12 @@ package eqtlmappingpipeline.interactionanalysis; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashSet; + import umcg.genetica.containers.Pair; +import umcg.genetica.io.trityper.SNP; /** * @@ -86,6 +91,7 @@ public class InteractionAnalysisResults { this.rsquared = rsquaredMatrix; } + public String getQcString() { return qcString; } @@ -142,4 +148,24 @@ public double[][] getCovariateSE() { return covariateSE; } + public ArrayList getProbeIds() { + ArrayList probeIds = new ArrayList(); + + for (Pair eqtl : eQTLsTested){ + String gene = eqtl.getRight(); + if (! 
probeIds.contains(gene)) + probeIds.add(gene); + } + return probeIds; + } + public ArrayList getSNPIds() { + ArrayList snpIds = new ArrayList(); + + for (Pair eqtl : eQTLsTested){ + String snp = eqtl.getLeft(); + if (! snpIds.contains(snp)) + snpIds.add(snp); + } + return snpIds; + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisTask.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisTask.java index ed6fbb9e0..b74ac36d5 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisTask.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionAnalysisTask.java @@ -5,8 +5,6 @@ package eqtlmappingpipeline.interactionanalysis; import cern.jet.random.tdouble.StudentT; -import java.util.ArrayList; -import java.util.concurrent.Callable; import org.apache.commons.math3.linear.SingularMatrixException; import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression; import org.rosuda.REngine.REXPMismatchException; @@ -21,251 +19,257 @@ import umcg.genetica.math.matrix.DoubleMatrixDataset; import umcg.genetica.math.stats.Correlation; +import java.util.ArrayList; +import java.util.concurrent.Callable; + /** - * * @author harmjan */ public class InteractionAnalysisTask implements Callable { - private SNP eQTLSNPObj; - private final double[][] pcCorrectedExpressionData; - private final int[] wgaId; - private final String[] expInds; - private final DoubleMatrixDataset covariateData; - private final TriTyperExpressionData expressionData; - private final ArrayList> eQTLsForSNP; - - private final boolean sandwich; - private final boolean provideFullStats; - + private SNP eQTLSNPObj; + private final double[][] pcCorrectedExpressionData; + private final int[] wgaId; + private final String[] expInds; + private final DoubleMatrixDataset covariateData; + private final 
TriTyperExpressionData expressionData; + private final ArrayList> eQTLsForSNP; + + private final boolean sandwich; + private final boolean provideFullStats; + private final Pair NAN_PAIR = new Pair(Double.NaN, Double.NaN); - public InteractionAnalysisTask(SNP snpObj, ArrayList> eQTLsForSNP, double[][] pcCorrectedData, - int[] wgaId, - String[] expInds, DoubleMatrixDataset covariateData, - TriTyperExpressionData expressionData, boolean robustSE, boolean provideFullStats) { - this.eQTLSNPObj = snpObj; - this.eQTLsForSNP = eQTLsForSNP; - this.pcCorrectedExpressionData = pcCorrectedData; - this.wgaId = wgaId; - this.expInds = expInds; - this.expressionData = expressionData; - this.covariateData = covariateData; - this.sandwich = robustSE; - this.provideFullStats = provideFullStats; - } - - @Override - public InteractionAnalysisResults call() throws Exception { - - ArrayList> eQTLsTested = new ArrayList>(); - - int nrTotalCovariates = covariateData.nrRows; - - double[][] interactionZScoreMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; - - double[][] SNPZResultMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; - double[][] covariateZResultMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; - double[][] maineffectZResultMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; - double[][] interactionBeta = null; - - double[][] interactionSE = null; - double[][] mainBeta = null; - double[][] mainSE = null; - double[][] covariateBeta = null; - double[][] covariateSE = null; - int[][] nMatrix = new int[eQTLsForSNP.size()][nrTotalCovariates]; - double[][] rsquaredMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; - if (provideFullStats) { - - interactionBeta = new double[eQTLsForSNP.size()][nrTotalCovariates]; - interactionSE = new double[eQTLsForSNP.size()][nrTotalCovariates]; - mainBeta = new double[eQTLsForSNP.size()][nrTotalCovariates]; - mainSE = new double[eQTLsForSNP.size()][nrTotalCovariates]; - covariateBeta = new 
double[eQTLsForSNP.size()][nrTotalCovariates]; - covariateSE = new double[eQTLsForSNP.size()][nrTotalCovariates]; - } - - //We are using a coding system that uses the minor allele. - //If allele2 is not the minor allele, change the sign of the results we will output. - double signInteractionEffectDirection = 1; - if (eQTLSNPObj.getAlleles()[1] == eQTLSNPObj.getMinorAllele()) { - signInteractionEffectDirection = -1; - } - - String qcString = null; - Integer nrGenotypesCalled = null; - - org.apache.commons.math3.distribution.FDistribution fDist = null; - cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = null; - cern.jet.random.tdouble.StudentT tDistColt = null; - - OLSMultipleLinearRegression regressionFullWithInteraction = new OLSMultipleLinearRegression(); - - for (int e = 0; e < eQTLsForSNP.size(); e++) { - Pair eqtl = eQTLsForSNP.get(e); - String eQTLProbeName = eqtl.getRight(); - - eQTLsTested.add(eqtl); - - Integer eQTLProbeId = expressionData.getProbeToId().get(eQTLProbeName); - - double[] valsX = eQTLSNPObj.selectGenotypes(wgaId, true, true); // this is sorted on expression ID - double[] valsY = pcCorrectedExpressionData[eQTLProbeId]; //Expression level - - for (int covariate = 0; covariate < nrTotalCovariates; covariate++) { - double[] tmpVarCelCount = new double[valsY.length]; - - for (int i = 0; i < tmpVarCelCount.length; i++) { - String sampleName = expInds[i]; - Integer individualIdInCovariateData = covariateData.hashCols.get(sampleName); - if (individualIdInCovariateData != null) { - // presorting greatly speeds this stuff up - tmpVarCelCount[i] = covariateData.rawData[covariate][individualIdInCovariateData]; - } else { - tmpVarCelCount[i] = Double.NaN; - } - } - - //Check whether all the expression samples have a genotype and a cell count... 
- int nrCalled = 0; - for (int i = 0; i < wgaId.length; i++) { - if (wgaId[i] != -1 && !Double.isNaN(tmpVarCelCount[i]) && valsX[i] != -1) { - nrCalled++; - } - } - - // THIS WILL GIVE ERRONEOUS VALUES WHEN THERE ARE MISSING - // VALUES IN VALSY THE NEXT TIME THIS SNP IS TESTED!! - // this value is required for subsequent meta-analysis.. fix for altering sample sizes (take smallest size / omit missing values) - // in stead use the value for N that is now in the standard output. - double[] genotypesCalled = new double[nrCalled]; - if (qcString == null) { - qcString = eQTLSNPObj.getName() + "\t" + ChrAnnotation.parseByte(eQTLSNPObj.getChr()) + "\t" + eQTLSNPObj.getChrPos() + "\t" + BaseAnnot.toString(eQTLSNPObj.getAlleles()[0]) + "/" + BaseAnnot.toString(eQTLSNPObj.getAlleles()[1]) + "\t" + BaseAnnot.toString(eQTLSNPObj.getMinorAllele()) + "\t" + eQTLSNPObj.getMAF() + "\t" + eQTLSNPObj.getCR() + "\t" + eQTLSNPObj.getHWEP() + "\t" + genotypesCalled.length; - nrGenotypesCalled = genotypesCalled.length; - } else if (genotypesCalled.length != nrGenotypesCalled) { - - System.err.println("ERROR: the number of available values has changed. 
Does your gene expression data or cell count file contain missing values?"); - System.exit(0); - } - - double zScoreInteraction = 0; - double zScoreSNP = 0; - double zScoreCovariate = 0; - double mainZ = 0; - - double betaInteraction = 0; - double seInteraction = 0; - double betaSNP = 0; - double seSNP = 0; - double betaCovariate = 0; - double seCovariate = 0; - - double rsquared = 0; - - if (sandwich) { - RConnection rConnection = null; - // this code is very suboptimal and is here for validation purposes only anyway - try { - rConnection = new RConnection(); -// rConnection.voidEval("install.packages('sandwich')"); - rConnection.voidEval("library(sandwich)"); - } catch (RserveException ex) { - System.err.println(ex.getMessage()); - rConnection = null; - } - - if (rConnection == null) { - System.err.println("Error: using R connection but none found"); - } - if (rConnection != null) { - try { - if (rConnection.isConnected()) { - double[] olsY = new double[nrCalled]; //Ordinary least squares: Our gene expression - double[] olsX = new double[nrCalled]; - double[] covariateValues = new double[nrCalled]; + + public InteractionAnalysisTask(SNP snpObj, ArrayList> eQTLsForSNP, double[][] pcCorrectedData, + int[] wgaId, + String[] expInds, DoubleMatrixDataset covariateData, + TriTyperExpressionData expressionData, boolean robustSE, boolean provideFullStats) { + this.eQTLSNPObj = snpObj; + this.eQTLsForSNP = eQTLsForSNP; + this.pcCorrectedExpressionData = pcCorrectedData; + this.wgaId = wgaId; + this.expInds = expInds; + this.expressionData = expressionData; + this.covariateData = covariateData; + this.sandwich = robustSE; + this.provideFullStats = provideFullStats; + + } + + + @Override + public InteractionAnalysisResults call() throws Exception { + + ArrayList> eQTLsTested = new ArrayList>(); + + int nrTotalCovariates = covariateData.nrRows; + + double[][] interactionZScoreMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; + + double[][] SNPZResultMatrix = new 
double[eQTLsForSNP.size()][nrTotalCovariates]; + double[][] covariateZResultMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; + double[][] maineffectZResultMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; + double[][] interactionBeta = null; + + double[][] interactionSE = null; + double[][] mainBeta = null; + double[][] mainSE = null; + double[][] covariateBeta = null; + double[][] covariateSE = null; + int[][] nMatrix = new int[eQTLsForSNP.size()][nrTotalCovariates]; + double[][] rsquaredMatrix = new double[eQTLsForSNP.size()][nrTotalCovariates]; + if (provideFullStats) { + + interactionBeta = new double[eQTLsForSNP.size()][nrTotalCovariates]; + interactionSE = new double[eQTLsForSNP.size()][nrTotalCovariates]; + mainBeta = new double[eQTLsForSNP.size()][nrTotalCovariates]; + mainSE = new double[eQTLsForSNP.size()][nrTotalCovariates]; + covariateBeta = new double[eQTLsForSNP.size()][nrTotalCovariates]; + covariateSE = new double[eQTLsForSNP.size()][nrTotalCovariates]; + } + + //We are using a coding system that uses the minor allele. + //If allele2 is not the minor allele, change the sign of the results we will output. 
+ double signInteractionEffectDirection = 1; + if (eQTLSNPObj.getAlleles()[1] == eQTLSNPObj.getMinorAllele()) { + signInteractionEffectDirection = -1; + } + + String qcString = null; + Integer nrGenotypesCalled = null; + + org.apache.commons.math3.distribution.FDistribution fDist = null; + cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = null; + cern.jet.random.tdouble.StudentT tDistColt = null; + + OLSMultipleLinearRegression regressionFullWithInteraction = new OLSMultipleLinearRegression(); + + for (int e = 0; e < eQTLsForSNP.size(); e++) { + Pair eqtl = eQTLsForSNP.get(e); + String eQTLProbeName = eqtl.getRight(); + + eQTLsTested.add(eqtl); + + Integer eQTLProbeId = expressionData.getProbeToId().get(eQTLProbeName); + + double[] valsX = eQTLSNPObj.selectGenotypes(wgaId, true, true); // this is sorted on expression ID + double[] valsY = pcCorrectedExpressionData[eQTLProbeId]; //Expression level + + for (int covariate = 0; covariate < nrTotalCovariates; covariate++) { + double[] tmpVarCelCount = new double[valsY.length]; + + for (int i = 0; i < tmpVarCelCount.length; i++) { + String sampleName = expInds[i]; + Integer individualIdInCovariateData = covariateData.hashCols.get(sampleName); + if (individualIdInCovariateData != null) { + // presorting greatly speeds this stuff up + tmpVarCelCount[i] = covariateData.rawData[covariate][individualIdInCovariateData]; + } else { + tmpVarCelCount[i] = Double.NaN; + } + } + + //Check whether all the expression samples have a genotype and a cell count... + int nrCalled = 0; + for (int i = 0; i < wgaId.length; i++) { + if (wgaId[i] != -1 && !Double.isNaN(tmpVarCelCount[i]) && valsX[i] != -1) { + nrCalled++; + } + } + + // THIS WILL GIVE ERRONEOUS VALUES WHEN THERE ARE MISSING + // VALUES IN VALSY THE NEXT TIME THIS SNP IS TESTED!! + // this value is required for subsequent meta-analysis.. 
fix for altering sample sizes (take smallest size / omit missing values) + // in stead use the value for N that is now in the standard output. + double[] genotypesCalled = new double[nrCalled]; + if (qcString == null) { + qcString = eQTLSNPObj.getName() + "\t" + ChrAnnotation.parseByte(eQTLSNPObj.getChr()) + "\t" + eQTLSNPObj.getChrPos() + "\t" + BaseAnnot.toString(eQTLSNPObj.getAlleles()[0]) + "/" + BaseAnnot.toString(eQTLSNPObj.getAlleles()[1]) + "\t" + BaseAnnot.toString(eQTLSNPObj.getMinorAllele()) + "\t" + eQTLSNPObj.getMAF() + "\t" + eQTLSNPObj.getCR() + "\t" + eQTLSNPObj.getHWEP() + "\t" + genotypesCalled.length; + nrGenotypesCalled = genotypesCalled.length; + } else if (genotypesCalled.length != nrGenotypesCalled) { + + System.err.println("ERROR: the number of available values has changed. Does your gene expression data or cell count file contain missing values?"); + System.exit(0); + } + + double zScoreInteraction = 0; + double zScoreSNP = 0; + double zScoreCovariate = 0; + double mainZ = 0; + + double betaInteraction = 0; + double seInteraction = 0; + double betaSNP = 0; + double seSNP = 0; + double betaCovariate = 0; + double seCovariate = 0; + + double rsquared = 0; + + if (sandwich) { + RConnection rConnection = null; + // this code is very suboptimal and is here for validation purposes only anyway + try { + rConnection = new RConnection(); + rConnection.voidEval("library(sandwich)"); + } catch (RserveException ex) { + System.err.println(ex.getMessage()); + rConnection = null; + } + + if (rConnection == null) { + System.err.println("Error: using R connection but none found"); + return null; + } + + try { + if (rConnection.isConnected()) { + double[] olsY = new double[nrCalled]; //Ordinary least squares: Our gene expression + double[] olsX = new double[nrCalled]; + double[] covariateValues = new double[nrCalled]; //No interaction term, linear model: y ~ a * SNP + b * CellCount + c // double[][] olsXFullWithInteraction = new double[nrCalled][3]; //With 
interaction term, linear model: y ~ a * SNP + b * CellCount + c + d * SNP * CellCount - int itr = 0; - for (int s = 0; s < valsX.length; s++) { - double genotype = valsX[s]; - if (genotype != -1 && !Double.isNaN(tmpVarCelCount[s])) { - if (signInteractionEffectDirection == -1) { - genotype = 2 - genotype; - } - covariateValues[itr] = tmpVarCelCount[s]; - olsY[itr] = valsY[s]; - olsX[itr] = genotype; - itr++; - } - } - - double corr = JSci.maths.ArrayMath.correlation(olsX, olsY); - mainZ = Correlation.convertCorrelationToZScore(olsX.length, corr); - - rConnection.assign("y", olsY); - rConnection.assign("x", olsX); - rConnection.assign("z", covariateValues); - rConnection.voidEval("interaction <- x*z"); - rConnection.voidEval("m <- lm(y ~ x + z + interaction)"); - rConnection.voidEval("modelsummary <- summary(m)"); - - rConnection.voidEval("m2 <- sqrt(diag(vcovHC(m, type = 'HC0')))"); // robust covariance model - - if (tDistColt == null) { - randomEngine = new cern.jet.random.tdouble.engine.DRand(); - tDistColt = new cern.jet.random.tdouble.StudentT(olsY.length - 4, randomEngine); - } - - betaInteraction = rConnection.eval("modelsummary$coefficients[4,1]").asDouble(); - seInteraction = rConnection.eval("as.numeric(m2[4])").asDouble(); - betaSNP = rConnection.eval("modelsummary$coefficients[2,1]").asDouble(); - seSNP = rConnection.eval("modelsummary$coefficients[2,2]").asDouble(); - betaCovariate = rConnection.eval("modelsummary$coefficients[3,1]").asDouble(); - seCovariate = rConnection.eval("modelsummary$coefficients[3,2]").asDouble(); - rsquared = rConnection.eval("modelsummary$r.squared").asDouble(); - rConnection.close(); - } else { - System.err.println("ERROR: R is not connected."); - } - - } catch (REngineException ex) { - System.err.println(ex.getMessage()); - } catch (REXPMismatchException ex) { - System.err.println(ex.getMessage()); - } - - } - - } else { - - //Fill arrays with data in order to be able to perform the ordinary least squares analysis: - 
double[] olsY = new double[nrCalled]; //Ordinary least squares: Our gene expression - - double[][] olsX = new double[nrCalled][2]; //No interaction term, linear model: y ~ a * SNP + b * CellCount + c - double[][] olsXFullWithInteraction = new double[nrCalled][3]; //With interaction term, linear model: y ~ a * SNP + b * CellCount + c + d * SNP * CellCount - int itr = 0; - for (int s = 0; s < valsX.length; s++) { - double genotype = valsX[s]; - if (genotype != -1 && !Double.isNaN(tmpVarCelCount[s])) { - if (signInteractionEffectDirection == -1) { - genotype = 2 - genotype; - } - genotypesCalled[itr] = genotype; - olsY[itr] = valsY[s]; - olsX[itr][0] = genotype; - olsXFullWithInteraction[itr][0] = genotype; - olsX[itr][1] = tmpVarCelCount[s]; - olsXFullWithInteraction[itr][1] = tmpVarCelCount[s]; - olsXFullWithInteraction[itr][2] = olsXFullWithInteraction[itr][0] * olsXFullWithInteraction[itr][1]; - itr++; - } - } + int itr = 0; + for (int s = 0; s < valsX.length; s++) { + double genotype = valsX[s]; + if (genotype != -1 && !Double.isNaN(tmpVarCelCount[s])) { + if (signInteractionEffectDirection == -1) { + genotype = 2 - genotype; + } + covariateValues[itr] = tmpVarCelCount[s]; + olsY[itr] = valsY[s]; + olsX[itr] = genotype; + itr++; + } + } + + double corr = JSci.maths.ArrayMath.correlation(olsX, olsY); + mainZ = Correlation.convertCorrelationToZScore(olsX.length, corr); + + + rConnection.assign("y", olsY); + rConnection.assign("x", olsX); + rConnection.assign("z", covariateValues); + rConnection.voidEval("interaction <- x*z"); + rConnection.voidEval("m <- lm(y ~ x + z + interaction)"); + rConnection.voidEval("modelsummary <- summary(m)"); + + rConnection.voidEval("m2 <- sqrt(diag(vcovHC(m, type = 'HC0')))"); // robust covariance model + + if (tDistColt == null) { + randomEngine = new cern.jet.random.tdouble.engine.DRand(); + tDistColt = new cern.jet.random.tdouble.StudentT(olsY.length - 4, randomEngine); + } + + betaInteraction = 
rConnection.eval("modelsummary$coefficients[4,1]").asDouble(); + seInteraction = rConnection.eval("as.numeric(m2[4])").asDouble(); + betaSNP = rConnection.eval("modelsummary$coefficients[2,1]").asDouble(); + seSNP = rConnection.eval("modelsummary$coefficients[2,2]").asDouble(); + betaCovariate = rConnection.eval("modelsummary$coefficients[3,1]").asDouble(); + seCovariate = rConnection.eval("modelsummary$coefficients[3,2]").asDouble(); + rsquared = rConnection.eval("modelsummary$r.squared").asDouble(); + + rConnection.close(); + } else { + System.err.println("ERROR: R is not connected."); + } + + } catch (REngineException ex) { + System.err.println(ex.getMessage()); + } catch (REXPMismatchException ex) { + System.err.println(ex.getMessage()); + } + + } else { + + //Fill arrays with data in order to be able to perform the ordinary least squares analysis: + double[] olsY = new double[nrCalled]; //Ordinary least squares: Our gene expression + + double[][] olsX = new double[nrCalled][2]; //No interaction term, linear model: y ~ a * SNP + b * CellCount + c + double[][] olsXFullWithInteraction = new double[nrCalled][3]; //With interaction term, linear model: y ~ a * SNP + b * CellCount + c + d * SNP * CellCount + int itr = 0; + for (int s = 0; s < valsX.length; s++) { + double genotype = valsX[s]; + if (genotype != -1 && !Double.isNaN(tmpVarCelCount[s])) { + if (signInteractionEffectDirection == -1) { + genotype = 2 - genotype; + } + genotypesCalled[itr] = genotype; + olsY[itr] = valsY[s]; + olsX[itr][0] = genotype; + olsXFullWithInteraction[itr][0] = genotype; + olsX[itr][1] = tmpVarCelCount[s]; + olsXFullWithInteraction[itr][1] = tmpVarCelCount[s]; + olsXFullWithInteraction[itr][2] = olsXFullWithInteraction[itr][0] * olsXFullWithInteraction[itr][1]; + itr++; + } + } + // regression.newSampleData(olsY, olsX); - regressionFullWithInteraction.newSampleData(olsY, olsXFullWithInteraction); + regressionFullWithInteraction.newSampleData(olsY, olsXFullWithInteraction); - // not 
sure if this is needed right now, but I will keep it in for later use. + // not sure if this is needed right now, but I will keep it in for later use. // double rss1 = regression.calculateResidualSumOfSquares(); // double rss2 = regressionFullWithInteraction.calculateResidualSumOfSquares(); // double anovaF = ((rss1 - rss2) / (3 - 2)) / (rss2 / (olsY.length - 3)); @@ -284,33 +288,32 @@ public InteractionAnalysisResults call() throws Exception { // } // } catch (Exception err) { // } - if (tDistColt == null) { - randomEngine = new cern.jet.random.tdouble.engine.DRand(); - tDistColt = new cern.jet.random.tdouble.StudentT(olsY.length - 4, randomEngine); - } - - // double intersect = regressionParameters[0]; - double corr = JSci.maths.ArrayMath.correlation(genotypesCalled, olsY); - mainZ = Correlation.convertCorrelationToZScore(genotypesCalled.length, corr); - - + if (tDistColt == null) { + randomEngine = new cern.jet.random.tdouble.engine.DRand(); + tDistColt = new cern.jet.random.tdouble.StudentT(olsY.length - 4, randomEngine); + } + + // double intersect = regressionParameters[0]; + double corr = JSci.maths.ArrayMath.correlation(genotypesCalled, olsY); + mainZ = Correlation.convertCorrelationToZScore(genotypesCalled.length, corr); + // Get the regression parameters and R-square value and print it. - try { + try { double[] regressionParameters = regressionFullWithInteraction.estimateRegressionParameters(); double[] regressionStandardErrors = regressionFullWithInteraction.estimateRegressionParametersStandardErrors(); - + betaInteraction = regressionParameters[3]; seInteraction = regressionStandardErrors[3]; // Get the regression parameters and R-square value and print it. 
betaSNP = regressionParameters[1]; seSNP = regressionStandardErrors[1]; - + betaCovariate = regressionParameters[2]; seCovariate = regressionStandardErrors[2]; - + rsquared = regressionFullWithInteraction.calculateRSquared(); - + } catch (SingularMatrixException ex) { betaInteraction = Double.NaN; seInteraction = Double.NaN; @@ -318,108 +321,108 @@ public InteractionAnalysisResults call() throws Exception { // Get the regression parameters and R-square value and print it. betaSNP = Double.NaN; seSNP = Double.NaN; - + betaCovariate = Double.NaN; seCovariate = Double.NaN; - + rsquared = Double.NaN; } - } - - Pair pair = convertBetaToP(betaInteraction, seInteraction, tDistColt); - double pValueInteraction = pair.getLeft(); - zScoreInteraction = pair.getRight(); - - pair = convertBetaToP(betaSNP, seSNP, tDistColt); - double pValueSNP = pair.getLeft(); - zScoreSNP = pair.getRight(); - - // Get the regression parameters and R-square value and print it. - pair = convertBetaToP(betaCovariate, seCovariate, tDistColt); - double pValueCovariate = pair.getLeft(); - zScoreCovariate = pair.getRight(); - - interactionZScoreMatrix[e][covariate] = zScoreInteraction; - SNPZResultMatrix[e][covariate] = zScoreSNP; - covariateZResultMatrix[e][covariate] = zScoreCovariate; - maineffectZResultMatrix[e][covariate] = mainZ; - nMatrix[e][covariate] = nrCalled; - rsquaredMatrix[e][covariate] = rsquared; - - // flip the covariate effect according to the main effect - if (provideFullStats) { - interactionBeta[e][covariate] = betaInteraction; - interactionSE[e][covariate] = seInteraction; - mainBeta[e][covariate] = betaSNP; - mainSE[e][covariate] = seSNP; - covariateBeta[e][covariate] = betaCovariate; - covariateSE[e][covariate] = seCovariate; - - } - } - } - - eQTLSNPObj.clearGenotypes(); - eQTLSNPObj = null; - - if (provideFullStats) { - - return new InteractionAnalysisResults( - qcString, - eQTLsTested, - interactionZScoreMatrix, - SNPZResultMatrix, - covariateZResultMatrix, - 
maineffectZResultMatrix, - interactionBeta, - interactionSE, - mainBeta, - mainSE, - covariateBeta, - covariateSE, - nMatrix, - rsquaredMatrix); - } else { - return new InteractionAnalysisResults( - qcString, - eQTLsTested, - interactionZScoreMatrix, - SNPZResultMatrix, - covariateZResultMatrix, - maineffectZResultMatrix, - nMatrix, - rsquaredMatrix); - - } - - } - - private Pair convertBetaToP(double beta, double se, StudentT tDistColt) { - - if(Double.isNaN(beta)){ + } + + Pair pair = convertBetaToP(betaInteraction, seInteraction, tDistColt); + double pValueInteraction = pair.getLeft(); + zScoreInteraction = pair.getRight(); + + pair = convertBetaToP(betaSNP, seSNP, tDistColt); + double pValueSNP = pair.getLeft(); + zScoreSNP = pair.getRight(); + + // Get the regression parameters and R-square value and print it. + pair = convertBetaToP(betaCovariate, seCovariate, tDistColt); + double pValueCovariate = pair.getLeft(); + zScoreCovariate = pair.getRight(); + + interactionZScoreMatrix[e][covariate] = zScoreInteraction; + SNPZResultMatrix[e][covariate] = zScoreSNP; + covariateZResultMatrix[e][covariate] = zScoreCovariate; + maineffectZResultMatrix[e][covariate] = mainZ; + nMatrix[e][covariate] = nrCalled; + rsquaredMatrix[e][covariate] = rsquared; + + // flip the covariate effect according to the main effect + if (provideFullStats) { + interactionBeta[e][covariate] = betaInteraction; + interactionSE[e][covariate] = seInteraction; + mainBeta[e][covariate] = betaSNP; + mainSE[e][covariate] = seSNP; + covariateBeta[e][covariate] = betaCovariate; + covariateSE[e][covariate] = seCovariate; + + } + } + } + + eQTLSNPObj.clearGenotypes(); + eQTLSNPObj = null; + + if (provideFullStats) { + + return new InteractionAnalysisResults( + qcString, + eQTLsTested, + interactionZScoreMatrix, + SNPZResultMatrix, + covariateZResultMatrix, + maineffectZResultMatrix, + interactionBeta, + interactionSE, + mainBeta, + mainSE, + covariateBeta, + covariateSE, + nMatrix, + rsquaredMatrix); + } 
else { + return new InteractionAnalysisResults( + qcString, + eQTLsTested, + interactionZScoreMatrix, + SNPZResultMatrix, + covariateZResultMatrix, + maineffectZResultMatrix, + nMatrix, + rsquaredMatrix); + + } + + } + + private Pair convertBetaToP(double beta, double se, StudentT tDistColt) { + + if (Double.isNaN(beta)) { return NAN_PAIR; } - - double t = beta / se; - double p = 1; - double z = 0; - if (t < 0) { - p = tDistColt.cdf(t); - if (p < 2.0E-323) { - p = 2.0E-323; - - } - z = cern.jet.stat.Probability.normalInverse(p); - } else { - p = tDistColt.cdf(-t); - if (p < 2.0E-323) { - p = 2.0E-323; - - } - z = -cern.jet.stat.Probability.normalInverse(p); - } - return new Pair(p, z); - } + + double t = beta / se; + double p = 1; + double z = 0; + if (t < 0) { + p = tDistColt.cdf(t); + if (p < 2.0E-323) { + p = 2.0E-323; + + } + z = cern.jet.stat.Probability.normalInverse(p); + } else { + p = tDistColt.cdf(-t); + if (p < 2.0E-323) { + p = 2.0E-323; + + } + z = -cern.jet.stat.Probability.normalInverse(p); + } + return new Pair(p, z); + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionPlotter.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionPlotter.java index 11fe1f87b..955941b52 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionPlotter.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/interactionanalysis/InteractionPlotter.java @@ -171,7 +171,11 @@ public InteractionPlotter(String interactionFile, String genotypeDir, String exp for (int q = startCovariate; q < endCovariate; q++) { System.out.println("Plotting: " + snp + "\t" + covariateData.rowObjects.get(q) + "\t" + probe); - System.out.println("Individual\tAllele1\tAllele2\tGenotype\tGenotypeFlipped\tCovariate\tExpression"); + + + TextFile interactionOut = new TextFile(outdir + snp + "-" + probe + "-" + covariateData.rowObjects.get(q) + ".txt", 
TextFile.W); + interactionOut.writeln("Individual\tAllele1\tAllele2\tGenotype\tGenotypeFlipped\tCovariate\tExpression"); + byte[] alleles1 = snpObj.getAllele1(); byte[] alleles2 = snpObj.getAllele2(); byte[] genotypes = snpObj.getGenotypes(); @@ -198,7 +202,7 @@ public InteractionPlotter(String interactionFile, String genotypeDir, String exp + "\t" + genotypeflipped + "\t" + covariateData.rawData[q][genotypeToCovariate[i]] + "\t" + expressionData.rawData[probeId][genotypeToExpression[i]]; - System.out.println(output); + interactionOut.writeln(output); genotypeArr.add(genotypes[i]); @@ -208,7 +212,10 @@ public InteractionPlotter(String interactionFile, String genotypeDir, String exp } } + } + interactionOut.close(); + System.out.println(""); //Fill arrays with data in order to be able to perform the ordinary least squares analysis: double[] olsY = new double[nrCalled]; //Ordinary least squares: Our gene expression @@ -241,55 +248,54 @@ public InteractionPlotter(String interactionFile, String genotypeDir, String exp regressionFullWithInteraction.newSampleData(olsY, olsXFullWithInteraction); - try{ - double rss2 = regressionFullWithInteraction.calculateResidualSumOfSquares(); - double[] regressionParameters = regressionFullWithInteraction.estimateRegressionParameters(); - - double[] regressionStandardErrors = regressionFullWithInteraction.estimateRegressionParametersStandardErrors(); - - - double betaInteraction = regressionParameters[3]; - double seInteraction = regressionStandardErrors[3]; - double tInteraction = betaInteraction / seInteraction; - double pValueInteraction = 1; - double zScoreInteraction = 0; - - if (fDist == null) { - fDist = new org.apache.commons.math3.distribution.FDistribution((int) (3 - 2), (int) (olsY.length - 3)); - randomEngine = new cern.jet.random.tdouble.engine.DRand(); - tDistColt = new cern.jet.random.tdouble.StudentT(olsY.length - 4, randomEngine); - } - - if (tInteraction < 0) { - pValueInteraction = tDistColt.cdf(tInteraction); - if 
(pValueInteraction < 2.0E-323) { - pValueInteraction = 2.0E-323; - } - zScoreInteraction = cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); - } else { - pValueInteraction = tDistColt.cdf(-tInteraction); - if (pValueInteraction < 2.0E-323) { - pValueInteraction = 2.0E-323; - } - - zScoreInteraction = -cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); - } - pValueInteraction *= 2; - String pvalFormatted = ""; - if (pValueInteraction >= 0.001) { - pvalFormatted = decFormat.format(pValueInteraction); - } else { - pvalFormatted = decFormatSmall.format(pValueInteraction); - } - ScatterPlot scatterPlot = new ScatterPlot(500, 500, dataCov, dataExp, dataGen, genotypeDescriptions, colorarray, ScatterPlot.OUTPUTFORMAT.PDF, - "Interaction between SNP " + snp + ", probe " + probe + " and covariate " + covariateData.rowObjects.get(q), - "Z: " + decFormat.format(zScoreInteraction) + " Pvalue: " + pvalFormatted + " n: " + nrCalled, - outdir + snp + "-" + probe + "-" + covariateData.rowObjects.get(q) + ".pdf", false); - - } catch(SingularMatrixException ex ){ - ex.printStackTrace(); - System.out.println("\tMatrix is singular, skipping\n"); - } + try { + double rss2 = regressionFullWithInteraction.calculateResidualSumOfSquares(); + double[] regressionParameters = regressionFullWithInteraction.estimateRegressionParameters(); + + double[] regressionStandardErrors = regressionFullWithInteraction.estimateRegressionParametersStandardErrors(); + + double betaInteraction = regressionParameters[3]; + double seInteraction = regressionStandardErrors[3]; + double tInteraction = betaInteraction / seInteraction; + double pValueInteraction = 1; + double zScoreInteraction = 0; + + if (fDist == null) { + fDist = new org.apache.commons.math3.distribution.FDistribution((int) (3 - 2), (int) (olsY.length - 3)); + randomEngine = new cern.jet.random.tdouble.engine.DRand(); + tDistColt = new cern.jet.random.tdouble.StudentT(olsY.length - 4, randomEngine); + } + + if 
(tInteraction < 0) { + pValueInteraction = tDistColt.cdf(tInteraction); + if (pValueInteraction < 2.0E-323) { + pValueInteraction = 2.0E-323; + } + zScoreInteraction = cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); + } else { + pValueInteraction = tDistColt.cdf(-tInteraction); + if (pValueInteraction < 2.0E-323) { + pValueInteraction = 2.0E-323; + } + + zScoreInteraction = -cern.jet.stat.tdouble.Probability.normalInverse(pValueInteraction); + } + pValueInteraction *= 2; + String pvalFormatted = ""; + if (pValueInteraction >= 0.001) { + pvalFormatted = decFormat.format(pValueInteraction); + } else { + pvalFormatted = decFormatSmall.format(pValueInteraction); + } + ScatterPlot scatterPlot = new ScatterPlot(500, 500, dataCov, dataExp, dataGen, genotypeDescriptions, colorarray, ScatterPlot.OUTPUTFORMAT.PDF, + "Interaction between SNP " + snp + ", probe " + probe + " and covariate " + covariateData.rowObjects.get(q), + "Z: " + decFormat.format(zScoreInteraction) + " Pvalue: " + pvalFormatted + " n: " + nrCalled, + outdir + snp + "-" + probe + "-" + covariateData.rowObjects.get(q) + ".pdf", false); + + } catch (SingularMatrixException ex) { + ex.printStackTrace(); + System.out.println("\tMatrix is singular, skipping\n"); + } } snpObj.clearGenotypes(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/CalculationThread.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/CalculationThread.java index cf8a551eb..98becd2b8 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/CalculationThread.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/CalculationThread.java @@ -333,7 +333,7 @@ private void analyze(WorkPackage wp) { // now push the results in the queue.. 
try { wp.setNumTested(testsPerformed); - m_result_queue.put(wp); + throwResult(wp); } catch (InterruptedException e) { e.printStackTrace(); } @@ -341,6 +341,8 @@ private void analyze(WorkPackage wp) { // System.out.println("Analyze: "+t1.getTimeDesc()); } + + protected static void test(int d, int p, Integer probeId, double[] x, double[] originalGenotypes, double varianceX, double varianceY, double meanY, boolean[] includeExpressionSample, int sampleCount, double[][] rawData, double[][] covariateRawData, Result r, WorkPackage wp, boolean metaAnalyseModelCorrelationYHat, boolean metaAnalyseInteractionTerms, boolean determinefoldchange) { final double[] y; double[][] covariates = covariateRawData; @@ -826,4 +828,8 @@ private void ploteQTL(WorkPackage wp, int p) { // } // randomNumberGenerator.deflatedZScores = inflatedZScores; // } + + private void throwResult(WorkPackage wp) throws InterruptedException { + m_result_queue.put(wp); + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/MetaQTL3.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/MetaQTL3.java index 500d2aaf9..54e335ad6 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/MetaQTL3.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/MetaQTL3.java @@ -8,7 +8,7 @@ import cern.colt.matrix.tint.IntMatrix2D; import cern.colt.matrix.tint.impl.DenseIntMatrix2D; import cern.colt.matrix.tint.impl.DenseLargeIntMatrix2D; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import eqtlmappingpipeline.metaqtl3.containers.WorkPackage; import eqtlmappingpipeline.metaqtl3.containers.Result; import umcg.genetica.math.stats.Descriptives; @@ -807,7 +807,7 @@ public void mapEQTLs() throws IOException { expressionToGenotypeIds[d] = m_gg[d].getExpressionToGenotypeIdArray(); } - LinkedBlockingQueue resultQueue = new LinkedBlockingQueue(100); + LinkedBlockingQueue resultQueue = new 
LinkedBlockingQueue(250); ResultProcessorThread resultthread = new ResultProcessorThread(m_settings.nrThreads, resultQueue, m_settings.createBinaryOutputFiles, m_gg, m_settings, m_probeTranslationTable, permuting, permutationRound, m_snpList, m_probeList, m_workPackages); resultthread.setName("ResultProcessorThread"); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/ResultProcessorThread.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/ResultProcessorThread.java index 332439da2..4e7562ded 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/ResultProcessorThread.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/ResultProcessorThread.java @@ -47,435 +47,468 @@ public class ResultProcessorThread extends Thread { // private TextFile[] zScoreBinaryFile; // private TextFile zScoreMetaAnalysisFile; // private int m_numdatasets = 0; - long nrZ = 0; - private boolean m_createBinaryFiles = false; - private TriTyperGeneticalGenomicsDataset[] m_gg = null; - private boolean m_cisOnly; - private IntMatrix2D m_probeTranslation; - private int m_midpointprobedist; - private final String m_outputdir; - private final boolean m_permuting; - private final int m_permutationround; - private final boolean m_createTEXTFiles; - private final String[] m_probeList; - private final LinkedBlockingQueue m_queue; - private final WorkPackage[] m_availableWorkPackages; - private long nrTestsPerformed = 0; - private QTL[] finalEQTLs; - private double maxSavedPvalue = -Double.MAX_VALUE; - private int locationToStoreResult = 0; - private boolean bufferHasOverFlown = false; - private boolean sorted = false; - private int m_maxResults = 0; - public double highestP = Double.MAX_VALUE; - private int nrSNPsTested = 0; - private final boolean m_useAbsoluteZScore; - private BinaryFile[] zScoreBinaryFile; - private BinaryFile zScoreMetaAnalysisFile; - private TextFile zScoreMetaAnalysisRowNamesFile; - 
private TextFile[] zScoreRowNamesFile; - - public ResultProcessorThread(int nrThreads, LinkedBlockingQueue queue, boolean chargeOutput, - TriTyperGeneticalGenomicsDataset[] gg, Settings settings, IntMatrix2D pprobeTranslation, - boolean permuting, int round, String[] snplist, String[] probelist, WorkPackage[] allPackages) { - m_availableWorkPackages = allPackages; - m_createBinaryFiles = settings.createBinaryOutputFiles; - m_createTEXTFiles = settings.createTEXTOutputFiles; - m_useAbsoluteZScore = settings.useAbsoluteZScorePValue; - m_queue = queue; - m_outputdir = settings.outputReportsDir; - m_permuting = permuting; - m_permutationround = round; - m_probeTranslation = pprobeTranslation; - m_gg = gg; - m_midpointprobedist = settings.ciseQTLAnalysMaxSNPProbeMidPointDistance; - m_cisOnly = (settings.cisAnalysis && !settings.transAnalysis); - - m_probeList = probelist; - m_maxResults = settings.maxNrMostSignificantEQTLs; - - int tmpbuffersize = (m_maxResults / 10); - - if (tmpbuffersize == 0) { - tmpbuffersize = 10; - } else if (tmpbuffersize > 250000) { - tmpbuffersize = 250000; - } + long nrZ = 0; + private boolean m_createBinaryFiles = false; + private TriTyperGeneticalGenomicsDataset[] m_gg = null; + private boolean m_cisOnly; + private IntMatrix2D m_probeTranslation; + private int m_midpointprobedist; + private final String m_outputdir; + private final boolean m_permuting; + private final int m_permutationround; + private final boolean m_createTEXTFiles; + private final String[] m_probeList; + private final LinkedBlockingQueue m_queue; + private final WorkPackage[] m_availableWorkPackages; + private long nrTestsPerformed = 0; + private QTL[] finalEQTLs; + private double maxSavedPvalue = -Double.MAX_VALUE; + private int locationToStoreResult = 0; + private boolean bufferHasOverFlown = false; + private boolean sorted = false; + private int m_maxResults = 0; + public double highestP = Double.MAX_VALUE; + private int nrSNPsTested = 0; + private final boolean 
m_useAbsoluteZScore; + private BinaryFile[] zScoreBinaryFile; + private BinaryFile zScoreMetaAnalysisFile; + private TextFile zScoreMetaAnalysisRowNamesFile; + private TextFile[] zScoreRowNamesFile; + + public ResultProcessorThread(int nrThreads, LinkedBlockingQueue queue, boolean chargeOutput, + TriTyperGeneticalGenomicsDataset[] gg, Settings settings, IntMatrix2D pprobeTranslation, + boolean permuting, int round, String[] snplist, String[] probelist, WorkPackage[] allPackages) { + m_availableWorkPackages = allPackages; + m_createBinaryFiles = settings.createBinaryOutputFiles; + m_createTEXTFiles = settings.createTEXTOutputFiles; + m_useAbsoluteZScore = settings.useAbsoluteZScorePValue; + m_queue = queue; + m_outputdir = settings.outputReportsDir; + m_permuting = permuting; + m_permutationround = round; + m_probeTranslation = pprobeTranslation; + m_gg = gg; + m_midpointprobedist = settings.ciseQTLAnalysMaxSNPProbeMidPointDistance; + m_cisOnly = (settings.cisAnalysis && !settings.transAnalysis); + + m_probeList = probelist; + m_maxResults = settings.maxNrMostSignificantEQTLs; + + int tmpbuffersize = (m_maxResults / 10); + + if (tmpbuffersize == 0) { + tmpbuffersize = 10; + } else if (tmpbuffersize > 250000) { + tmpbuffersize = 250000; + } // m_totalNumberOfProbes = probelist.length; // m_pvaluePlotThreshold = settings.plotOutputPValueCutOff; // tmpEQTLBuffer = new QTL[tmpbuffersize]; // m_result_counter = 0; // m_numdatasets = m_gg.length; - finalEQTLs = new QTL[(m_maxResults + tmpbuffersize)]; - nrSNPsTested = 0; - } + finalEQTLs = new QTL[(m_maxResults + tmpbuffersize)]; + nrSNPsTested = 0; + } - @Override - public void run() { + @Override + public void run() { // nrProcessed = 0; - try { - if (m_createBinaryFiles) { - zScoreBinaryFile = new BinaryFile[m_gg.length]; - zScoreRowNamesFile = new TextFile[m_gg.length]; - if (m_gg.length > 1) { - String metaAnalysisFileName = m_outputdir + "MetaAnalysis"; - if (m_permuting) { - metaAnalysisFileName += 
"-PermutationRound-" + m_permutationround; - } - zScoreMetaAnalysisFile = new BinaryFile(metaAnalysisFileName + ".dat", BinaryFile.W); - // write magic number - if (m_cisOnly) { - zScoreMetaAnalysisFile.writeInt(1); - } else { - zScoreMetaAnalysisFile.writeInt(0); - } - - zScoreMetaAnalysisRowNamesFile = new TextFile(metaAnalysisFileName + "-RowNames.txt.gz", TextFile.W); - zScoreMetaAnalysisRowNamesFile.writeln("SNP\tAlleles\tMinorAllele\tAlleleAssessed\tNrCalled"); - TextFile tf = new TextFile(metaAnalysisFileName + "-ColNames.txt.gz", TextFile.W); - tf.writeList(Arrays.asList(m_probeList)); - tf.close(); - } - for (int d = 0; d < m_gg.length; d++) { - String fileName = m_outputdir + m_gg[d].getSettings().name; - if (m_permuting) { - fileName += "-PermutationRound-" + m_permutationround; - } - zScoreBinaryFile[d] = new BinaryFile(fileName + ".dat", BinaryFile.W); - // write magic number - if (m_cisOnly) { - zScoreBinaryFile[d].writeInt(1); - } else { - zScoreBinaryFile[d].writeInt(0); - } - - TextFile tf = new TextFile(fileName + "-ColNames.txt.gz", TextFile.W); - tf.writeList(Arrays.asList(m_probeList)); - tf.close(); - zScoreRowNamesFile[d] = new TextFile(fileName + "-RowNames.txt.gz", TextFile.W); - zScoreRowNamesFile[d].writeln("SNP\tAlleles\tMinorAllele\tAlleleAssessed\tNrCalled\tMaf\tHWE\tCallRate"); - } - } - - ProgressBar progressbar = new ProgressBar(m_availableWorkPackages.length); - boolean poison = false; - - while (!poison) { - WorkPackage wp = m_queue.take(); - Result r = wp.results; - if (wp.getHasResults()) { - nrSNPsTested++; - } - - if (r.poison) { - poison = true; - } else if (r.pvalues != null) { - - nrTestsPerformed += wp.getNumTested(); - - double[] pvalues = r.pvalues; - - //Is this working? - if (m_createBinaryFiles && !poison) { - writeBinaryResult(r); - } - - if (m_createTEXTFiles && !poison) { - // classic textual output. 
- - for (int p = 0; p < pvalues.length; p++) { - double pval = pvalues[p]; - - if (!Double.isNaN(pval) && pval <= highestP) { - double[][] corr = r.correlations; - double[] correlations = new double[corr.length]; - double[] zscores = new double[corr.length]; - int[] samples = new int[corr.length]; - - double[] fc = new double[corr.length]; - double[] beta = new double[corr.length]; - double[] betase = new double[corr.length]; - - for (int d = 0; d < correlations.length; d++) { - if (Double.isNaN(corr[d][p])) { - correlations[d] = Double.NaN; - zscores[d] = Double.NaN; - samples[d] = -9; - fc[d] = Double.NaN; - beta[d] = Double.NaN; - betase[d] = Double.NaN; - } else { - correlations[d] = corr[d][p]; - if (m_useAbsoluteZScore) { - zscores[d] = Math.abs(r.zscores[d][p]); - } else { - zscores[d] = r.zscores[d][p]; - } - - samples[d] = r.numSamples[d]; - fc[d] = r.fc[d][p]; - beta[d] = r.beta[d][p]; - betase[d] = r.se[d][p]; - } - } + try { + if (m_createBinaryFiles) { + zScoreBinaryFile = new BinaryFile[m_gg.length]; + zScoreRowNamesFile = new TextFile[m_gg.length]; + if (m_gg.length > 1) { + String metaAnalysisFileName = m_outputdir + "MetaAnalysis"; + if (m_permuting) { + metaAnalysisFileName += "-PermutationRound-" + m_permutationround; + } + zScoreMetaAnalysisFile = new BinaryFile(metaAnalysisFileName + ".dat", BinaryFile.W); + // write magic number + if (m_cisOnly) { + zScoreMetaAnalysisFile.writeInt(1); + } else { + zScoreMetaAnalysisFile.writeInt(0); + } + + zScoreMetaAnalysisRowNamesFile = new TextFile(metaAnalysisFileName + "-RowNames.txt.gz", TextFile.W); + zScoreMetaAnalysisRowNamesFile.writeln("SNP\tAlleles\tMinorAllele\tAlleleAssessed\tNrCalled"); + TextFile tf = new TextFile(metaAnalysisFileName + "-ColNames.txt.gz", TextFile.W); + tf.writeList(Arrays.asList(m_probeList)); + tf.close(); + } + for (int d = 0; d < m_gg.length; d++) { + String fileName = m_outputdir + m_gg[d].getSettings().name; + if (m_permuting) { + fileName += "-PermutationRound-" + 
m_permutationround; + } + zScoreBinaryFile[d] = new BinaryFile(fileName + ".dat", BinaryFile.W); + // write magic number + if (m_cisOnly) { + zScoreBinaryFile[d].writeInt(1); + } else { + zScoreBinaryFile[d].writeInt(0); + } + + TextFile tf = new TextFile(fileName + "-ColNames.txt.gz", TextFile.W); + tf.writeList(Arrays.asList(m_probeList)); + tf.close(); + zScoreRowNamesFile[d] = new TextFile(fileName + "-RowNames.txt.gz", TextFile.W); + zScoreRowNamesFile[d].writeln("SNP\tAlleles\tMinorAllele\tAlleleAssessed\tNrCalled\tMaf\tHWE\tCallRate"); + } + } + + ProgressBar progressbar = new ProgressBar(m_availableWorkPackages.length); + boolean poison = false; + + while (!poison) { + WorkPackage wp = m_queue.take(); + Result r = wp.results; + if (wp.getHasResults()) { + nrSNPsTested++; + } + + if (r.poison) { + poison = true; + } else if (r.pvalues != null) { + + nrTestsPerformed += wp.getNumTested(); + + double[] pvalues = r.pvalues; + + //Is this working? + if (m_createBinaryFiles && !poison) { + writeBinaryResult(r); + } + + if (m_createTEXTFiles && !poison) { + // classic textual output. 
+ + for (int p = 0; p < pvalues.length; p++) { + double pval = pvalues[p]; + + if (!Double.isNaN(pval) && pval <= highestP) { + double[][] corr = r.correlations; + double[] correlations = new double[corr.length]; + double[] zscores = new double[corr.length]; + int[] samples = new int[corr.length]; + + double[] fc = new double[corr.length]; + double[] beta = new double[corr.length]; + double[] betase = new double[corr.length]; + + for (int d = 0; d < correlations.length; d++) { + if (Double.isNaN(corr[d][p])) { + correlations[d] = Double.NaN; + zscores[d] = Double.NaN; + samples[d] = -9; + fc[d] = Double.NaN; + beta[d] = Double.NaN; + betase[d] = Double.NaN; + } else { + correlations[d] = corr[d][p]; + if (m_useAbsoluteZScore) { + zscores[d] = Math.abs(r.zscores[d][p]); + } else { + zscores[d] = r.zscores[d][p]; + } + + samples[d] = r.numSamples[d]; + fc[d] = r.fc[d][p]; + beta[d] = r.beta[d][p]; + betase[d] = r.se[d][p]; + } + } // - byte allele = -1; - byte[] alleles = null; - SNP[] snps = wp.getSnps(); - for (int d = 0; d < snps.length; d++) { - if (snps[d] != null) { - allele = snps[d].getMinorAllele(); - alleles = snps[d].getAlleles(); - break; - } - } + byte allele = -1; + byte[] alleles = null; + SNP[] snps = wp.getSnps(); + for (int d = 0; d < snps.length; d++) { + if (snps[d] != null) { + allele = snps[d].getMinorAllele(); + alleles = snps[d].getAlleles(); + break; + } + } + + if (alleles == null) { + System.err.println("SNP has null alleles: "); + for (int d = 0; d < snps.length; d++) { + + if (snps[d] != null) { + + allele = snps[d].getMinorAllele(); + System.err.println(allele); + alleles = snps[d].getAlleles(); + System.err.println(alleles); + break; + } + } + } + + double Zfinal = r.finalZScore[p]; + double finalbeta = r.finalBeta[p]; + double finalbetase = r.finalBetaSe[p]; + int pid; + if (m_cisOnly) { + pid = wp.getProbes()[p]; + } else { + pid = p; + } + + addEQTL(pid, wp.getId(), pval, Zfinal, correlations, zscores, samples, alleles, allele, fc, 
beta, betase, finalbeta, finalbetase); + + } + } + } + + } + + if (wp.results != null) { + wp.clearResults(); - if (alleles == null) { - System.err.println("SNP has null alleles: "); - for (int d = 0; d < snps.length; d++) { + } - if (snps[d] != null) { + progressbar.iterate(); + } - allele = snps[d].getMinorAllele(); - System.err.println(allele); - alleles = snps[d].getAlleles(); - System.err.println(alleles); - break; - } - } - } + progressbar.close(); - double Zfinal = r.finalZScore[p]; - double finalbeta = r.finalBeta[p]; - double finalbetase = r.finalBetaSe[p]; - int pid; - if (m_cisOnly) { - pid = wp.getProbes()[p]; - } else { - pid = p; - } + //Is this working? + if (m_createBinaryFiles) { - addEQTL(pid, wp.getId(), pval, Zfinal, correlations, zscores, samples, alleles, allele, fc, beta, betase, finalbeta, finalbetase); + String fileName = "check"; + if (m_permuting) { + fileName += "-PermutationRound-" + m_permutationround; + } + fileName += ".md5"; - } - } - } + HexBinaryAdapter md5Parser = new HexBinaryAdapter(); - } + BufferedWriter md5writer = new BufferedWriter(new FileWriter(m_outputdir + fileName)); - if (wp.results != null) { - wp.clearResults(); + for (int d = 0; d < m_gg.length; d++) { + zScoreBinaryFile[d].close(); - } + fileName = m_gg[d].getSettings().name; + if (m_permuting) { + fileName += "-PermutationRound-" + m_permutationround; + } + fileName += ".dat"; + md5writer.write(md5Parser.marshal(zScoreBinaryFile[d].getWrittenHash()) + " " + fileName + '\n'); - progressbar.iterate(); - } + zScoreRowNamesFile[d].close(); + } + if (m_gg.length > 1) { + zScoreMetaAnalysisFile.close(); - progressbar.close(); + fileName = "MetaAnalysis"; + if (m_permuting) { + fileName += "-PermutationRound-" + m_permutationround; + } + fileName += ".dat"; + md5writer.write(md5Parser.marshal(zScoreMetaAnalysisFile.getWrittenHash()) + " " + fileName + '\n'); + zScoreMetaAnalysisRowNamesFile.close(); + } + md5writer.close(); + } - //Is this working? 
- if (m_createBinaryFiles) { + if (m_createTEXTFiles) { + if (!sorted) { + if (locationToStoreResult != 0) { - String fileName = "check"; - if (m_permuting) { - fileName += "-PermutationRound-" + m_permutationround; - } - fileName += ".md5"; - - HexBinaryAdapter md5Parser = new HexBinaryAdapter(); + Arrays.sort(finalEQTLs, 0, locationToStoreResult); +// SmoothSort.sort(finalEQTLs, 0, locationToStoreResult); +// inplaceArrayQuickSort.sort(finalEQTLs, 0, locationToStoreResult); - BufferedWriter md5writer = new BufferedWriter(new FileWriter(m_outputdir + fileName)); + } + } + writeTextResults(); + } - for (int d = 0; d < m_gg.length; d++) { - zScoreBinaryFile[d].close(); + } catch (IOException e1) { + e1.printStackTrace(); + } catch (InterruptedException e2) { + e2.printStackTrace(); + } + } - fileName = m_gg[d].getSettings().name; - if (m_permuting) { - fileName += "-PermutationRound-" + m_permutationround; - } - fileName += ".dat"; - md5writer.write(md5Parser.marshal(zScoreBinaryFile[d].getWrittenHash()) + " " + fileName + '\n'); + private void writeBinaryResult(Result r) throws IOException { - zScoreRowNamesFile[d].close(); - } - if (m_gg.length > 1) { - zScoreMetaAnalysisFile.close(); + if (r != null) { + int[] numSamples = null; + try { + numSamples = r.numSamples; + } catch (NullPointerException e) { + System.out.println("ERROR: null result?"); + } - fileName = "MetaAnalysis"; - if (m_permuting) { - fileName += "-PermutationRound-" + m_permutationround; - } - fileName += ".dat"; - md5writer.write(md5Parser.marshal(zScoreMetaAnalysisFile.getWrittenHash()) + " " + fileName + '\n'); + int wpId = r.wpid; + WorkPackage currentWP = m_availableWorkPackages[wpId]; + double[][] zscores = r.zscores; + + if (zscores != null) { + SNP[] snps = currentWP.getSnps(); + int numDatasets = zscores.length; + double[] finalZscores = r.finalZScore; + StringBuilder snpoutput = null; + + // if we're doing a meta-analysis, write the meta-analysis Z to a separate binaryFile + if 
(m_gg.length > 1) { + int totalSampleNr = 0; + String snpname = null; + for (int d = 0; d < numDatasets; d++) { + if (snps[d] != null) { + snpname = snps[d].getName(); + + byte[] alleles = snps[d].getAlleles(); + byte minorAllele = snps[d].getMinorAllele(); + byte alleleassessed = alleles[1]; + + if (currentWP.getFlipSNPAlleles()[d]) { + alleleassessed = alleles[0]; + } + if (snpoutput == null) { + snpoutput = new StringBuilder(); + snpoutput.append(snpname); + snpoutput.append("\t"); + snpoutput.append(BaseAnnot.getAllelesDescription(alleles)); + snpoutput.append("\t"); + snpoutput.append(BaseAnnot.toString(minorAllele)); + snpoutput.append("\t"); + snpoutput.append(BaseAnnot.toString(alleleassessed)); + } + totalSampleNr += r.numSamples[d]; + } + } + + StringBuilder sb = null; + for (int p = 0; p < finalZscores.length; p++) { + float z = (float) finalZscores[p]; + if (m_cisOnly) { + int[] probes = currentWP.getProbes(); + int probeId = probes[p]; + String probeName = m_probeList[probeId]; + if (sb == null) { + sb = new StringBuilder(); + } else { + sb.append("\t"); + } + sb.append(probeName); + + zScoreMetaAnalysisFile.writeFloat(z); + } else { + zScoreMetaAnalysisFile.writeFloat(z); + } + } + + if (snpoutput != null) { + snpoutput.append("\t"); + snpoutput.append(totalSampleNr); + snpoutput.append("\t-\t-\t-\t"); + snpoutput.append(finalZscores.length); + snpoutput.append("\t"); + if (sb != null) { + snpoutput.append(sb.toString()); + } else { + snpoutput.append("-"); + } + zScoreMetaAnalysisRowNamesFile.writeln(snpoutput.toString()); + } + } - zScoreMetaAnalysisRowNamesFile.close(); - } + for (int d = 0; d < numDatasets; d++) { + double[] datasetZScores = zscores[d]; + SNP datasetSNP = snps[d]; + if (datasetSNP != null) { + BinaryFile outfile = zScoreBinaryFile[d]; + + String snpname = datasetSNP.getName(); + + byte[] alleles = datasetSNP.getAlleles(); + byte minorAllele = datasetSNP.getMinorAllele(); + byte alleleassessed = alleles[1]; + double hwe = 
datasetSNP.getHWEP(); + double cr = datasetSNP.getCR(); + double maf = datasetSNP.getMAF(); + + if (currentWP.getFlipSNPAlleles()[d]) { + alleleassessed = alleles[0]; + } + TextFile snpfile = zScoreRowNamesFile[d]; + StringBuilder sb = null; + for (int p = 0; p < datasetZScores.length; p++) { + float z = (float) datasetZScores[p]; + if (currentWP.getFlipSNPAlleles()[d]) { + z *= -1; + } + // System.out.println(p + "\t" + alleleassessed + "\t" + m_probeList[p] + "\t" + z + "\t" + currentWP.getFlipSNPAlleles()[d]); + if (m_cisOnly) { + // take into account that not all probes have been tested.. + int[] probes = currentWP.getProbes(); + int probeId = probes[p]; + String probeName = m_probeList[probeId]; + outfile.writeFloat(z); + if (sb == null) { + sb = new StringBuilder(); + } else { + sb.append("\t"); + } + sb.append(probeName); + } else { + outfile.writeFloat(z); + } + } + + StringBuilder buffer = new StringBuilder(); + buffer.append(snpname) + .append("\t") + .append(BaseAnnot.getAllelesDescription(alleles)) + .append("\t") + .append(BaseAnnot.toString(minorAllele)) + .append("\t") + .append(BaseAnnot.toString(alleleassessed)) + .append("\t") + .append(datasetSNP.getNrCalled()) + .append("\t") + .append(maf) + .append("\t") + .append(hwe) + .append("\t") + .append(cr) + .append("\t") + .append(datasetZScores.length) + .append("\t"); + if (sb != null) { + buffer.append(sb.toString()); + } else { + buffer.append("-"); + } + + snpfile.writeln(buffer.toString()); + + } + } + } + } + } - md5writer.close(); + private void addEQTL(int pid, int sid, double pval, double zscore, double[] correlations, double[] zscores, int[] numSamples, byte[] alleles, byte assessedAllele, double[] fc, double[] beta, double[] betase, double finalbeta, double finalbetase) { - } + if (bufferHasOverFlown) { + if (pval <= maxSavedPvalue) { + sorted = false; - if (m_createTEXTFiles) { - if (!sorted) { - if (locationToStoreResult != 0) { + finalEQTLs[locationToStoreResult] = new QTL(pval, pid, 
sid, assessedAllele, zscore, alleles, zscores, numSamples, correlations, fc, beta, betase, finalbeta, finalbetase); + locationToStoreResult++; - Arrays.sort(finalEQTLs, 0, locationToStoreResult); -// SmoothSort.sort(finalEQTLs, 0, locationToStoreResult); -// inplaceArrayQuickSort.sort(finalEQTLs, 0, locationToStoreResult); + if (locationToStoreResult == finalEQTLs.length) { - } - } - writeTextResults(); - } - - } catch (IOException e1) { - e1.printStackTrace(); - } catch (InterruptedException e2) { - e2.printStackTrace(); - } - } - - private void writeBinaryResult(Result r) throws IOException { - - if (r != null) { - int[] numSamples = null; - try { - numSamples = r.numSamples; - } catch (NullPointerException e) { - System.out.println("ERROR: null result?"); - } - - int wpId = r.wpid; - WorkPackage currentWP = m_availableWorkPackages[wpId]; - double[][] zscores = r.zscores; - - if (zscores != null) { - SNP[] snps = currentWP.getSnps(); - int numDatasets = zscores.length; - double[] finalZscores = r.finalZScore; - String snpoutput = null; - - // if we're doing a meta-analysis, write the meta-analysis Z to a separate binaryFile - if (m_gg.length > 1) { - int totalSampleNr = 0; - String snpname = null; - for (int d = 0; d < numDatasets; d++) { - if (snps[d] != null) { - snpname = snps[d].getName(); - - byte[] alleles = snps[d].getAlleles(); - byte minorAllele = snps[d].getMinorAllele(); - byte alleleassessed = alleles[1]; - - if (currentWP.getFlipSNPAlleles()[d]) { - alleleassessed = alleles[0]; - } - if (snpoutput == null) { - snpoutput = snpname + "\t" + BaseAnnot.getAllelesDescription(alleles) + "\t" + BaseAnnot.toString(minorAllele) + "\t" + BaseAnnot.toString(alleleassessed); - } - totalSampleNr += r.numSamples[d]; - } - } - - StringBuilder sb = null; - for (int p = 0; p < finalZscores.length; p++) { - float z = (float) finalZscores[p]; - if (m_cisOnly) { - int[] probes = currentWP.getProbes(); - int probeId = probes[p]; - String probeName = m_probeList[probeId]; 
- if (sb == null) { - sb = new StringBuilder(); - } else { - sb.append("\t"); - } - sb.append(probeName); - - zScoreMetaAnalysisFile.writeFloat(z); - } else { - zScoreMetaAnalysisFile.writeFloat(z); - } - } - - if (sb != null) { - zScoreMetaAnalysisRowNamesFile.writeln(snpoutput + "\t" + totalSampleNr + "\t-\t-\t-\t" + finalZscores.length + "\t" + sb.toString()); - } else { - zScoreMetaAnalysisRowNamesFile.writeln(snpoutput + "\t" + totalSampleNr + "\t-\t-\t-\t" + finalZscores.length + "\t-"); - } - } - for (int d = 0; d < numDatasets; d++) { - double[] datasetZScores = zscores[d]; - SNP datasetSNP = snps[d]; - if (datasetSNP != null) { - BinaryFile outfile = zScoreBinaryFile[d]; - - String snpname = datasetSNP.getName(); - - byte[] alleles = datasetSNP.getAlleles(); - byte minorAllele = datasetSNP.getMinorAllele(); - byte alleleassessed = alleles[1]; - double hwe = datasetSNP.getHWEP(); - double cr = datasetSNP.getCR(); - double maf = datasetSNP.getMAF(); - - if (currentWP.getFlipSNPAlleles()[d]) { - alleleassessed = alleles[0]; - } - TextFile snpfile = zScoreRowNamesFile[d]; - StringBuilder sb = null; - for (int p = 0; p < datasetZScores.length; p++) { - float z = (float) datasetZScores[p]; - if (currentWP.getFlipSNPAlleles()[d]) { - z *= -1; - } - // System.out.println(p + "\t" + alleleassessed + "\t" + m_probeList[p] + "\t" + z + "\t" + currentWP.getFlipSNPAlleles()[d]); - if (m_cisOnly) { - // take into account that not all probes have been tested.. 
- int[] probes = currentWP.getProbes(); - int probeId = probes[p]; - String probeName = m_probeList[probeId]; - outfile.writeFloat(z); - if (sb == null) { - sb = new StringBuilder(); - } else { - sb.append("\t"); - } - sb.append(probeName); - } else { - outfile.writeFloat(z); - } - } - - if (sb != null) { - snpfile.writeln(snpname + "\t" + BaseAnnot.getAllelesDescription(alleles) + "\t" + BaseAnnot.toString(minorAllele) + "\t" + BaseAnnot.toString(alleleassessed) + "\t" + datasetSNP.getNrCalled() + "\t" + maf + "\t" + hwe + "\t" + cr + "\t" + datasetZScores.length + "\t" + sb.toString()); - } else { - snpfile.writeln(snpname + "\t" + BaseAnnot.getAllelesDescription(alleles) + "\t" + BaseAnnot.toString(minorAllele) + "\t" + BaseAnnot.toString(alleleassessed) + "\t" + datasetSNP.getNrCalled() + "\t" + maf + "\t" + hwe + "\t" + cr + "\t" + datasetZScores.length + "\t-"); - } - - } - } - } - } - } - - private void addEQTL(int pid, int sid, double pval, double zscore, double[] correlations, double[] zscores, int[] numSamples, byte[] alleles, byte assessedAllele, double[] fc, double[] beta, double[] betase, double finalbeta, double finalbetase) { - - if (bufferHasOverFlown) { - if (pval <= maxSavedPvalue) { - - sorted = false; - - finalEQTLs[locationToStoreResult] = new QTL(pval, pid, sid, assessedAllele, zscore, alleles, zscores, numSamples, correlations, fc, beta, betase, finalbeta, finalbetase); - locationToStoreResult++; - - if (locationToStoreResult == finalEQTLs.length) { - - Arrays.sort(finalEQTLs); + Arrays.sort(finalEQTLs); // SmoothSort.sort(finalEQTLs); // inplaceArrayQuickSort.sort(finalEQTLs); sorted = true; @@ -507,8 +540,6 @@ private void writeTextResults() throws IOException { System.out.println("Writing " + nrOfEntriesToWrite + " results out of " + nrTestsPerformed + " tests performed. 
" + nrSNPsTested + " SNPs finally tested."); - - if (m_permuting) { TextFile gz = new TextFile((m_outputdir + "PermutedEQTLsPermutationRound" + m_permutationround + ".txt.gz"), TextFile.W); gz.writeln("PValue\tSNP\tProbe\tGene\tAlleles\tAlleleAssessed\tZScore"); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLDotPlot.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLDotPlot.java index a03f468ab..e154c3008 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLDotPlot.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLDotPlot.java @@ -4,7 +4,7 @@ */ package eqtlmappingpipeline.metaqtl3.graphics; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import java.awt.Color; import java.awt.Graphics2D; import java.awt.RenderingHints; @@ -61,19 +61,19 @@ public void draw(String inputFile, String outputFile, Output output) throws IOEx int innerHeight = y1 - y0; Graphics2D g2d = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfWriter writer = null; - com.lowagie.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfWriter writer = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; BufferedImage bi = null; if (output == Output.PDF) { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outputFile)); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); + document = new com.itextpdf.text.Document(rectangle); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outputFile)); document.open(); cb = writer.getDirectContent(); cb.saveState(); - 
//com.lowagie.text.pdf.DefaultFontMapper fontMap = new com.lowagie.text.pdf.DefaultFontMapper(); + //com.itextpdf.text.pdf.DefaultFontMapper fontMap = new com.itextpdf.text.pdf.DefaultFontMapper(); g2d = cb.createGraphics(width, height); } else { bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLPlotter.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLPlotter.java index 63a52c1ee..5326dc27d 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLPlotter.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/EQTLPlotter.java @@ -164,18 +164,18 @@ public void draw(WorkPackage wp, int pid) { Graphics2D g2d = null; BufferedImage bi = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfContentByte cb = null; - com.lowagie.text.pdf.PdfWriter writer = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.pdf.PdfWriter writer = null; if (outputPlotsFileType == FILE_TYPE_PNG) { bi = new java.awt.image.BufferedImage(width, height, java.awt.image.BufferedImage.TYPE_INT_RGB); g2d = bi.createGraphics(); } else { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); + document = new com.itextpdf.text.Document(rectangle); try { - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(file)); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(file)); document.open(); cb = writer.getDirectContent(); cb.saveState(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/QQPlot.java 
b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/QQPlot.java index 6c2bf34bd..ee0af840f 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/QQPlot.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/metaqtl3/graphics/QQPlot.java @@ -44,19 +44,19 @@ public void draw(String fileName, double fdrCutOff, int nrPermutationsFDR, int m } Graphics2D g2d = null; BufferedImage bi = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfContentByte cb = null; - com.lowagie.text.pdf.PdfWriter writer = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.pdf.PdfWriter writer = null; if (outputPlotsFileType == FILE_TYPE_PNG) { bi = new java.awt.image.BufferedImage(width, height, java.awt.image.BufferedImage.TYPE_INT_RGB); g2d = bi.createGraphics(); } else { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); + document = new com.itextpdf.text.Document(rectangle); try { - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(fileQQPlot)); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(fileQQPlot)); document.open(); cb = writer.getDirectContent(); cb.saveState(); diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/normalization/Normalizer.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/normalization/Normalizer.java index 5ccf49e0b..0e59f10e6 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/normalization/Normalizer.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/normalization/Normalizer.java @@ -1,11 +1,5 @@ package eqtlmappingpipeline.normalization; -import java.io.File; -import 
java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; import org.apache.commons.math3.stat.ranking.NaNStrategy; import org.apache.commons.math3.stat.ranking.NaturalRanking; import org.apache.commons.math3.stat.ranking.TiesStrategy; @@ -15,8 +9,8 @@ import umcg.genetica.io.text.TextFile; import umcg.genetica.math.PCA; import umcg.genetica.math.matrix.DoubleMatrixDataset; -import umcg.genetica.math.matrix.MatrixTools; import umcg.genetica.math.matrix.MatrixHandling; +import umcg.genetica.math.matrix.MatrixTools; import umcg.genetica.math.stats.Descriptives; import umcg.genetica.math.stats.Log2Transform; import umcg.genetica.math.stats.QuantileNormalization; @@ -25,340 +19,350 @@ import umcg.genetica.math.stats.concurrent.ConcurrentCovariation; import umcg.genetica.methylation.ConvertBetaAndMvalues; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + /** - * * @author harmjan */ public class Normalizer { - //nrIntermediatePCAsOverSamplesToRemoveToOutput = 5 - //nrPCAsOverSamplesToRemove = 100 - public void normalize(String expressionFile, String probeIncludeList, String sampleIncludeList, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput, String covariatesToRemove, boolean orthogonalizecovariates, String outdir, - boolean runQQNorm, boolean runLog2Transform, boolean runMTransform, boolean runCenterScale, boolean runPCA, boolean adjustCovariates, boolean forceMissingValues, boolean forceReplacementOfMissingValues, - boolean forceReplacementOfMissingValues2, boolean treatZerosAsNulls, boolean forceNormalDistribution) throws IOException { - - System.out.println("Running normalization."); - if (outdir != null) { - outdir = Gpio.formatAsDirectory(outdir); - Gpio.createDir(outdir); - } else { - if (Gpio.getParentDir(expressionFile) == null) { - //This happens for 
relative paths in current dir - outdir = ""; - } else { - outdir = Gpio.getParentDir(expressionFile) + Gpio.getFileSeparator(); - } - - } - - String parentDir = Gpio.getParentDir(expressionFile); - String expressionFileName = Gpio.getFileName(expressionFile); - if (parentDir == null) { - parentDir = ""; - } - - if (expressionFileName.contains(".txt.gz")) { - expressionFileName = expressionFileName.replaceAll(".txt.gz", ""); - } else { - expressionFileName = expressionFileName.replaceAll(".txt", ""); - } - - String outputFileNamePrefix = outdir + expressionFileName; - - - Set s = null; - if(sampleIncludeList != null){ - TextFile t = new TextFile(sampleIncludeList, TextFile.R); - s = new HashSet(t.readAsArrayList()); - } - Set p = null; - if(probeIncludeList != null){ - TextFile t = new TextFile(probeIncludeList, TextFile.R); - p = new HashSet(t.readAsArrayList()); - } - DoubleMatrixDataset dataset = null; - - if(s != null || p!=null){ - dataset = new DoubleMatrixDataset(expressionFile, p, s); - //Check if samples are correclty loaded. - boolean breakAfterCheck = false; - if(s!=null){ - outputFileNamePrefix = outputFileNamePrefix + ".SampleSelection"; - HashSet tmpNames = new HashSet(); - tmpNames.addAll(dataset.colObjects); - tmpNames.addAll(s); - HashSet missingNames = new HashSet(); - HashSet extraNames = new HashSet(); - for(String colName : tmpNames){ - if(!s.contains(colName)){ - extraNames.add(colName); - } - if(!dataset.colObjects.contains(colName)) { - missingNames.add(colName); - } - } - if(!missingNames.isEmpty()){ - System.err.println("\nMatrix does not contains desired columns, please check filtering list."); - System.err.println(missingNames.toString()+"\n"); - breakAfterCheck = true; - } else if(!extraNames.isEmpty()){ - System.err.println("\nMatrix contains unwanted columns, please check filtering list."); - System.err.println(extraNames.toString()+"\n"); - breakAfterCheck = true; - } - } - //Check if probes are correclty loaded. 
- if(p!=null){ - outputFileNamePrefix = outputFileNamePrefix + ".ProbeSelection"; - HashSet tmpNames = new HashSet(); - tmpNames.addAll(dataset.rowObjects); - tmpNames.addAll(p); - HashSet missingNames = new HashSet(); - HashSet extraNames = new HashSet(); - for(String rowName : tmpNames){ - if(!p.contains(rowName)){ - extraNames.add(rowName); - } - if(!dataset.rowObjects.contains(rowName)) { - missingNames.add(rowName); - } - } - if(!missingNames.isEmpty()){ - System.err.println("\nMatrix does not contains desired rows, please check filtering list."); - System.err.println(missingNames.toString()+"\n"); - breakAfterCheck = true; - } else if(!extraNames.isEmpty()){ - System.err.println("\nMatrix contains unwanted rows, please check filtering list."); - System.err.println(extraNames.toString()+"\n"); - breakAfterCheck = true; - } - } - + //nrIntermediatePCAsOverSamplesToRemoveToOutput = 5 + //nrPCAsOverSamplesToRemove = 100 + public void normalize(String expressionFile, String probeIncludeList, String sampleIncludeList, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput, String covariatesToRemove, boolean orthogonalizecovariates, String outdir, + boolean runQQNorm, boolean runLog2Transform, boolean runMTransform, boolean runCenterScale, boolean runPCA, boolean adjustCovariates, boolean forceMissingValues, boolean forceReplacementOfMissingValues, + boolean forceReplacementOfMissingValues2, boolean treatZerosAsNulls, boolean forceNormalDistribution) throws IOException { + + System.out.println("Running normalization."); + if (outdir != null) { + outdir = Gpio.formatAsDirectory(outdir); + Gpio.createDir(outdir); + } else { + if (Gpio.getParentDir(expressionFile) == null) { + //This happens for relative paths in current dir + outdir = ""; + } else { + outdir = Gpio.getParentDir(expressionFile) + Gpio.getFileSeparator(); + } + + } + + String parentDir = Gpio.getParentDir(expressionFile); + String expressionFileName = 
Gpio.getFileName(expressionFile); + if (parentDir == null) { + parentDir = ""; + } + + if (expressionFileName.contains(".txt.gz")) { + expressionFileName = expressionFileName.replaceAll(".txt.gz", ""); + } else { + expressionFileName = expressionFileName.replaceAll(".txt", ""); + } + + String outputFileNamePrefix = outdir + expressionFileName; + + + Set s = null; + if (sampleIncludeList != null) { + TextFile t = new TextFile(sampleIncludeList, TextFile.R); + s = new HashSet(t.readAsArrayList()); + } + Set p = null; + if (probeIncludeList != null) { + TextFile t = new TextFile(probeIncludeList, TextFile.R); + p = new HashSet(t.readAsArrayList()); + } + DoubleMatrixDataset dataset = null; + + if (s != null || p != null) { + dataset = new DoubleMatrixDataset(expressionFile, p, s); + //Check if samples are correclty loaded. + boolean breakAfterCheck = false; + if (s != null) { + outputFileNamePrefix = outputFileNamePrefix + ".SampleSelection"; + HashSet tmpNames = new HashSet(); + tmpNames.addAll(dataset.colObjects); + tmpNames.addAll(s); + HashSet missingNames = new HashSet(); + HashSet extraNames = new HashSet(); + for (String colName : tmpNames) { + if (!s.contains(colName)) { + extraNames.add(colName); + } + if (!dataset.colObjects.contains(colName)) { + missingNames.add(colName); + } + } + if (!missingNames.isEmpty()) { + System.err.println("\nMatrix does not contains desired columns, please check filtering list."); + System.err.println(missingNames.toString() + "\n"); + breakAfterCheck = true; + } else if (!extraNames.isEmpty()) { + System.err.println("\nMatrix contains unwanted columns, please check filtering list."); + System.err.println(extraNames.toString() + "\n"); + breakAfterCheck = true; + } + } + //Check if probes are correclty loaded. 
+ if (p != null) { + outputFileNamePrefix = outputFileNamePrefix + ".ProbeSelection"; + HashSet tmpNames = new HashSet(); + tmpNames.addAll(dataset.rowObjects); + tmpNames.addAll(p); + HashSet missingNames = new HashSet(); + HashSet extraNames = new HashSet(); + for (String rowName : tmpNames) { + if (!p.contains(rowName)) { + extraNames.add(rowName); + } + if (!dataset.rowObjects.contains(rowName)) { + missingNames.add(rowName); + } + } + if (!missingNames.isEmpty()) { + System.err.println("\nMatrix does not contains desired rows, please check filtering list."); + System.err.println(missingNames.toString() + "\n"); + breakAfterCheck = true; + } else if (!extraNames.isEmpty()) { + System.err.println("\nMatrix contains unwanted rows, please check filtering list."); + System.err.println(extraNames.toString() + "\n"); + breakAfterCheck = true; + } + } + // if(breakAfterCheck){ // System.exit(-1); // } - - dataset.save(outputFileNamePrefix + ".txt.gz"); - } else { - dataset = new DoubleMatrixDataset(expressionFile); - } - - - // check for probes with zero variance, if there > 3 samples in the dataset - if (dataset.nrCols > 3) { - outputFileNamePrefix = removeProbesWithZeroVariance(dataset, outputFileNamePrefix); - } - - if (runQQNorm) { - outputFileNamePrefix = quantileNormalize(dataset, outputFileNamePrefix, forceMissingValues, forceReplacementOfMissingValues, forceReplacementOfMissingValues2, treatZerosAsNulls); - } - if (runLog2Transform) { - outputFileNamePrefix = log2transform(dataset, outputFileNamePrefix); - } - if (runMTransform) { - outputFileNamePrefix = mValueTransform(dataset, outputFileNamePrefix); - } - if (runCenterScale) { - outputFileNamePrefix = centerAndScale(dataset, outputFileNamePrefix); - } - - if (adjustCovariates && covariatesToRemove != null) { - outputFileNamePrefix = adjustCovariates(dataset, outputFileNamePrefix, covariatesToRemove, orthogonalizecovariates, 1E-10); - } - - if (runPCA) { - ConcurrentCorrelation c = new 
ConcurrentCorrelation(2); - double[][] correlationMatrix = c.pairwiseCorrelation(dataset.getRawDataTransposed()); - Pair, DoubleMatrixDataset> PCAResults = calculatePCA(dataset, correlationMatrix, outputFileNamePrefix, null); - if(nrPCAsOverSamplesToRemove != 0 || nrIntermediatePCAsOverSamplesToRemoveToOutput != 0){ - correctDataForPCs(dataset, outputFileNamePrefix, nrPCAsOverSamplesToRemove, nrIntermediatePCAsOverSamplesToRemoveToOutput, PCAResults.getLeft(), PCAResults.getRight()); - } - } - - if(forceNormalDistribution){ + + dataset.save(outputFileNamePrefix + ".txt.gz"); + } else { + dataset = new DoubleMatrixDataset(expressionFile); + } + + + // check for probes with zero variance, if there > 3 samples in the dataset + if (dataset.nrCols > 3) { + outputFileNamePrefix = removeProbesWithZeroVariance(dataset, outputFileNamePrefix); + } + + if (runQQNorm) { + outputFileNamePrefix = quantileNormalize(dataset, outputFileNamePrefix, forceMissingValues, forceReplacementOfMissingValues, forceReplacementOfMissingValues2, treatZerosAsNulls); + } + if (runLog2Transform) { + outputFileNamePrefix = log2transform(dataset, outputFileNamePrefix); + } + if (runMTransform) { + outputFileNamePrefix = mValueTransform(dataset, outputFileNamePrefix); + } + if (runCenterScale) { + outputFileNamePrefix = centerAndScale(dataset, outputFileNamePrefix); + } + + if (adjustCovariates && covariatesToRemove != null) { + outputFileNamePrefix = adjustCovariates(dataset, outputFileNamePrefix, covariatesToRemove, orthogonalizecovariates, 1E-10); + } + + if (runPCA) { + ConcurrentCorrelation c = new ConcurrentCorrelation(2); + double[][] correlationMatrix = c.pairwiseCorrelation(dataset.getRawDataTransposed()); + Pair, DoubleMatrixDataset> PCAResults = calculatePCA(dataset, correlationMatrix, outputFileNamePrefix, null); + if (nrPCAsOverSamplesToRemove != 0 || nrIntermediatePCAsOverSamplesToRemoveToOutput != 0) { + correctDataForPCs(dataset, outputFileNamePrefix, nrPCAsOverSamplesToRemove, 
nrIntermediatePCAsOverSamplesToRemoveToOutput, PCAResults.getLeft(), PCAResults.getRight()); + } + } + + if (forceNormalDistribution) { outputFileNamePrefix = forceNormalDistribution(dataset, outputFileNamePrefix); } - } + } + + + NaturalRanking ranking = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE); + + public double[] forceNormal(double[] data) { + double[] rankedValues = ranking.rank(data); + for (int s = 0; s < data.length; s++) { + //Convert the rank to a proportion, with range <0, 1> + double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); + //Convert the pValue to a Z-Score: + data[s] = cern.jet.stat.Probability.normalInverse(pValue); + } + return data; + } + - public String forceNormalDistribution(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException{ + public String forceNormalDistribution(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { double[][] rawData = dataset.getRawData(); - - NaturalRanking ranking = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE); - for (int p = 0; p < dataset.rowObjects.size(); p++) { - - double[] rankedValues = ranking.rank(rawData[p]); - - for (int s = 0; s < dataset.colObjects.size(); s++) { - //Convert the rank to a proportion, with range <0, 1> - double pValue = (0.5d + rankedValues[s] - 1d) / (double) (rankedValues.length); - //Convert the pValue to a Z-Score: - rawData[p][s] = cern.jet.stat.Probability.normalInverse(pValue); - } - } - + rawData[p] = forceNormal(rawData[p]); + } + DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); - fileNamePrefix += ".ForcedNormal"; - datasetNormalized.save(fileNamePrefix + ".txt.gz"); - return fileNamePrefix; - - + fileNamePrefix += ".ForcedNormal"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + return fileNamePrefix; + + + } + + public String quantileNormalize(DoubleMatrixDataset dataset, String fileNamePrefix, boolean 
forceMissingValues, boolean forceReplacementOfMissingValues, boolean forceReplacementOfMissingValues2, boolean treatZerosAsNulls) throws IOException { + double[][] rawData = dataset.getRawData(); + + boolean dataContainsNulls = MatrixTools.containsNaNs(rawData); + + if (treatZerosAsNulls && dataContainsNulls) { + System.out.println("Warning: Data already contains nulls before treating zeros as nulls.\n Later on it will not be possible to distinguish between those two!"); + } + if (treatZerosAsNulls) { + MatrixHandling.ReplaceZerosToNull(rawData); + dataContainsNulls = MatrixTools.containsNaNs(rawData); + } + + if (!dataContainsNulls) { + QuantileNormalization.quantilenormalize(rawData); + } else if (forceReplacementOfMissingValues) { + QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, false, false, false); + } else if (forceReplacementOfMissingValues2) { + QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, false, true, false); + } else if (forceMissingValues && treatZerosAsNulls) { + QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, true, false, true); + } else if (forceMissingValues) { + QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, true, false, false); + } else { + System.out.println("Warning: Your data contains missing values and missing value treatment is not selected.\n" + + "If desired please supply additional flag: --forceMissingValues or --forceReplacementOfMissingValues"); + System.exit(0); + } + + if (treatZerosAsNulls) { + MatrixHandling.ReplaceNullToZero(rawData); + } + + DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); + fileNamePrefix += ".QuantileNormalized"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + + return fileNamePrefix; + } + + public String log2transform(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { + double[][] rawData = 
dataset.getRawData(); + Log2Transform.log2transform(rawData); + DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); + fileNamePrefix += ".Log2Transformed"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + return fileNamePrefix; } - - public String quantileNormalize(DoubleMatrixDataset dataset, String fileNamePrefix, boolean forceMissingValues, boolean forceReplacementOfMissingValues, boolean forceReplacementOfMissingValues2, boolean treatZerosAsNulls) throws IOException { - double[][] rawData = dataset.getRawData(); - - boolean dataContainsNulls = MatrixTools.containsNaNs(rawData); - - if(treatZerosAsNulls && dataContainsNulls){ - System.out.println("Warning: Data already contains nulls before treating zeros as nulls.\n Later on it will not be possible to distinguish between those two!"); - } - if(treatZerosAsNulls){ - MatrixHandling.ReplaceZerosToNull(rawData); - dataContainsNulls = MatrixTools.containsNaNs(rawData); - } - - if (!dataContainsNulls) { - QuantileNormalization.quantilenormalize(rawData); - } else if(forceReplacementOfMissingValues){ - QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, false, false, false); - } else if(forceReplacementOfMissingValues2){ - QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, false, true, false); - } else if(forceMissingValues && treatZerosAsNulls){ - QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, true, false, true); - } else if(forceMissingValues){ - QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, true, false, false); - } else { - System.out.println("Warning: Your data contains missing values and missing value treatment is not selected.\n" - + "If desired please supply additional flag: --forceMissingValues or --forceReplacementOfMissingValues"); - System.exit(0); - } - - if(treatZerosAsNulls){ - MatrixHandling.ReplaceNullToZero(rawData); - } - - 
DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); - fileNamePrefix += ".QuantileNormalized"; - datasetNormalized.save(fileNamePrefix + ".txt.gz"); - - return fileNamePrefix; - } - - public String log2transform(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { - double[][] rawData = dataset.getRawData(); - Log2Transform.log2transform(rawData); - DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); - fileNamePrefix += ".Log2Transformed"; - datasetNormalized.save(fileNamePrefix + ".txt.gz"); - return fileNamePrefix; - } - - public String mValueTransform(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { - double[][] rawData = dataset.getRawData(); - ConvertBetaAndMvalues.transformToMvalue(rawData); - DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); - fileNamePrefix += ".MvalueTransformed"; - datasetNormalized.save(fileNamePrefix + ".txt.gz"); - return fileNamePrefix; - } - - public String centerAndScale(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { - double[][] rawData = dataset.getRawData(); - System.out.println("Standardizing probe mean"); - for (int p = 0; p < dataset.rowObjects.size(); p++) { - double mean = Descriptives.mean(rawData[p]); - //double stdev = Math.sqrt(Descriptives.variance(rawData[p], mean)); - for (int s = 0; s < dataset.colObjects.size(); s++) { - rawData[p][s] -= mean; - } - } - - dataset.setRawData(rawData); - fileNamePrefix += ".ProbesCentered"; - dataset.save(fileNamePrefix + ".txt.gz"); - - System.out.println("- Standardizing sample mean and standard deviation"); - for (int s = 0; s < dataset.colObjects.size(); s++) { - double[] vals = new double[dataset.rowObjects.size()]; - for (int p = 0; p < dataset.rowObjects.size(); p++) { - vals[p] = dataset.getRawData()[p][s]; - } - double 
mean = Descriptives.mean(vals); - for (int p = 0; p < dataset.rowObjects.size(); p++) { - vals[p] -= mean; - } - double var = Descriptives.variance(vals, mean); - double stdev = Math.sqrt(var); - for (int p = 0; p < dataset.rowObjects.size(); p++) { - dataset.getRawData()[p][s] = (vals[p] / stdev); - } - } - - DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); - fileNamePrefix += ".SamplesZTransformed"; - datasetNormalized.save(fileNamePrefix + ".txt.gz"); - return fileNamePrefix; - } - - public String adjustCovariates(DoubleMatrixDataset traitData, String fileNamePrefix, String covariatesToRemove, boolean orthogonalizecovariates, double varianceExplainedCutoff) throws IOException { - // load covariate data, and remove samples for which there is missing covariate data. - Pair, DoubleMatrixDataset> covariateData = loadCovariateValues(covariatesToRemove, traitData); - DoubleMatrixDataset covariateDataset = covariateData.getLeft(); - DoubleMatrixDataset traitDataUpdated = covariateData.getRight(); - - traitData.rawData = traitDataUpdated.rawData; - traitData.colObjects = traitDataUpdated.colObjects; - traitData.rowObjects = traitDataUpdated.rowObjects; - traitData.recalculateHashMaps(); - - double[][] covariateValues = null; - double[] pcaExpVar = null; - - System.out.println("Covariate data has " + covariateDataset.nrRows + " rows and " + covariateDataset.nrCols + " columns."); - - for (int p = 0; p < covariateDataset.rowObjects.size(); p++) { - double mean = Descriptives.mean(covariateDataset.getRawData()[p]); - double stdev = Math.sqrt(Descriptives.variance(covariateDataset.getRawData()[p], mean)); - for (int s = 0; s < covariateDataset.colObjects.size(); s++) { - covariateDataset.getRawData()[p][s] -= mean; - covariateDataset.getRawData()[p][s] /= stdev; - } - } - - //Covariation on a centered and scaled matrix equals the correlation. - //Covariation is faster to compute. 
- ConcurrentCovariation c = new ConcurrentCovariation(2); - double[][] correlationMatrix = c.pairwiseCovariation(covariateDataset.getRawData()); - covariateDataset.transposeDataset(); - Pair, DoubleMatrixDataset> PCAResults = calculatePCA(covariateDataset, correlationMatrix, covariatesToRemove, null); - - // replace covariateValues with orthogonal ones... - covariateDataset = PCAResults.getLeft(); - - - covariateDataset.transposeDataset(); - covariateValues = covariateDataset.getRawData(); - - System.out.println(covariateDataset.nrRows + " covariates finally loaded."); - - // load the eigenvalues - pcaExpVar = new double[covariateValues.length]; - System.out.println("Loading eigenvalues from: " + covariatesToRemove + ".PCAOverSamplesEigenvalues.txt.gz"); - TextFile tf = new TextFile(covariatesToRemove + ".PCAOverSamplesEigenvalues.txt.gz", TextFile.R); // - // skip header - tf.readLine(); - String[] elems = tf.readLineElems(TextFile.tab); - while (elems != null) { - if (elems.length > 2) { - int pcanr = Integer.parseInt(elems[0]); - double expvar = Double.parseDouble(elems[1]); - pcaExpVar[pcanr - 1] = expvar; - System.out.println(pcanr + "\t" + expvar); - } - elems = tf.readLineElems(TextFile.tab); - } - tf.close(); + + public String mValueTransform(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { + double[][] rawData = dataset.getRawData(); + ConvertBetaAndMvalues.transformToMvalue(rawData); + DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); + fileNamePrefix += ".MvalueTransformed"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + return fileNamePrefix; + } + + public String centerAndScale(DoubleMatrixDataset dataset, String fileNamePrefix) throws IOException { + double[][] rawData = dataset.getRawData(); + System.out.println("Standardizing probe mean"); + for (int p = 0; p < dataset.rowObjects.size(); p++) { + double mean = Descriptives.mean(rawData[p]); + //double 
stdev = Math.sqrt(Descriptives.variance(rawData[p], mean)); + for (int s = 0; s < dataset.colObjects.size(); s++) { + rawData[p][s] -= mean; + } + } + + dataset.setRawData(rawData); + fileNamePrefix += ".ProbesCentered"; + dataset.save(fileNamePrefix + ".txt.gz"); + + System.out.println("- Standardizing sample mean and standard deviation"); + for (int s = 0; s < dataset.colObjects.size(); s++) { + double[] vals = new double[dataset.rowObjects.size()]; + for (int p = 0; p < dataset.rowObjects.size(); p++) { + vals[p] = dataset.getRawData()[p][s]; + } + double mean = Descriptives.mean(vals); + for (int p = 0; p < dataset.rowObjects.size(); p++) { + vals[p] -= mean; + } + double var = Descriptives.variance(vals, mean); + double stdev = Math.sqrt(var); + for (int p = 0; p < dataset.rowObjects.size(); p++) { + dataset.getRawData()[p][s] = (vals[p] / stdev); + } + } + + DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawData, dataset.rowObjects, dataset.colObjects); + fileNamePrefix += ".SamplesZTransformed"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + return fileNamePrefix; + } + + public String adjustCovariates(DoubleMatrixDataset traitData, String fileNamePrefix, String covariatesToRemove, boolean orthogonalizecovariates, double varianceExplainedCutoff) throws IOException { + // load covariate data, and remove samples for which there is missing covariate data. 
+ Pair, DoubleMatrixDataset> covariateData = loadCovariateValues(covariatesToRemove, traitData); + DoubleMatrixDataset covariateDataset = covariateData.getLeft(); + DoubleMatrixDataset traitDataUpdated = covariateData.getRight(); + + traitData.rawData = traitDataUpdated.rawData; + traitData.colObjects = traitDataUpdated.colObjects; + traitData.rowObjects = traitDataUpdated.rowObjects; + traitData.recalculateHashMaps(); + + double[][] covariateValues = null; + double[] pcaExpVar = null; + + System.out.println("Covariate data has " + covariateDataset.nrRows + " rows and " + covariateDataset.nrCols + " columns."); + + for (int p = 0; p < covariateDataset.rowObjects.size(); p++) { + double mean = Descriptives.mean(covariateDataset.getRawData()[p]); + double stdev = Math.sqrt(Descriptives.variance(covariateDataset.getRawData()[p], mean)); + for (int s = 0; s < covariateDataset.colObjects.size(); s++) { + covariateDataset.getRawData()[p][s] -= mean; + covariateDataset.getRawData()[p][s] /= stdev; + } + } + + //Covariation on a centered and scaled matrix equals the correlation. + //Covariation is faster to compute. + ConcurrentCovariation c = new ConcurrentCovariation(2); + double[][] correlationMatrix = c.pairwiseCovariation(covariateDataset.getRawData()); + covariateDataset.transposeDataset(); + Pair, DoubleMatrixDataset> PCAResults = calculatePCA(covariateDataset, correlationMatrix, covariatesToRemove, null); + + // replace covariateValues with orthogonal ones... 
+ covariateDataset = PCAResults.getLeft(); + + + covariateDataset.transposeDataset(); + covariateValues = covariateDataset.getRawData(); + + System.out.println(covariateDataset.nrRows + " covariates finally loaded."); + + // load the eigenvalues + pcaExpVar = new double[covariateValues.length]; + System.out.println("Loading eigenvalues from: " + covariatesToRemove + ".PCAOverSamplesEigenvalues.txt.gz"); + TextFile tf = new TextFile(covariatesToRemove + ".PCAOverSamplesEigenvalues.txt.gz", TextFile.R); // + // skip header + tf.readLine(); + String[] elems = tf.readLineElems(TextFile.tab); + while (elems != null) { + if (elems.length > 2) { + int pcanr = Integer.parseInt(elems[0]); + double expvar = Double.parseDouble(elems[1]); + pcaExpVar[pcanr - 1] = expvar; + System.out.println(pcanr + "\t" + expvar); + } + elems = tf.readLineElems(TextFile.tab); + } + tf.close(); // } else { // // PCA has been performed a-priori. Just check whether the user has supplied proper covariates. // if (covariateValues.length > 1) { @@ -394,247 +398,248 @@ public String adjustCovariates(DoubleMatrixDataset traitData, St // } - double[][] rawdata = traitData.getRawData(); - for (int i = 0; i < covariateValues.length; i++) { - if (pcaExpVar == null || pcaExpVar[i] > varianceExplainedCutoff) { - correctForCovariate(rawdata, covariateValues, i); - } else { - System.out.println("Not regressing covariate: " + i + " because explained variance < " + varianceExplainedCutoff + ": " + pcaExpVar[i]); - } - } + double[][] rawdata = traitData.getRawData(); + for (int i = 0; i < covariateValues.length; i++) { + if (pcaExpVar == null || pcaExpVar[i] > varianceExplainedCutoff) { + correctForCovariate(rawdata, covariateValues, i); + } else { + System.out.println("Not regressing covariate: " + i + " because explained variance < " + varianceExplainedCutoff + ": " + pcaExpVar[i]); + } + } - DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawdata, traitData.rowObjects, traitData.colObjects); 
- fileNamePrefix += ".CovariatesRemoved"; - datasetNormalized.save(fileNamePrefix + ".txt.gz"); + traitData.rawData = rawdata; + + //Why was this done??????? + //DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset(rawdata, traitData.rowObjects, traitData.colObjects); + fileNamePrefix += ".CovariatesRemoved"; + traitData.save(fileNamePrefix + ".txt.gz"); - traitData.rawData = rawdata; + + return fileNamePrefix; + } - return fileNamePrefix; - } + /** + * Calculate correlation over columns in DoubleMatrixDataset. WARNING: this + * method assumes that SD == 1 and mean == 0 (which makes the covariance + * equal to the correlation). + * + * @param dataset + * @return + */ + private double[][] correlateSamples(DoubleMatrixDataset dataset) { + double[][] correlationMatrix = new double[dataset.colObjects.size()][dataset.colObjects.size()]; + double probeCountMinusOne = dataset.rowObjects.size() - 1; + + ProgressBar pb = new ProgressBar(dataset.colObjects.size(), "- Calculating correlations: " + dataset.colObjects.size() + " x " + dataset.colObjects.size()); + + for (int f = 0; f < dataset.colObjects.size(); f++) { + + + for (int g = f; g < dataset.colObjects.size(); g++) { + double covarianceInterim = 0; + for (int p = 0; p < dataset.rowObjects.size(); p++) { + covarianceInterim += dataset.getRawData()[p][f] * dataset.getRawData()[p][g]; + } + double covariance = covarianceInterim / probeCountMinusOne; + correlationMatrix[f][g] = covariance; + correlationMatrix[g][f] = covariance; +// System.out.println(f + "\t" + g + "\t" + covariance); + } + pb.iterate(); + } + pb.close(); + return correlationMatrix; + } - /** - * Calculate correlation over columns in DoubleMatrixDataset. WARNING: this - * method assumes that SD == 1 and mean == 0 (which makes the covariance - * equal to the correlation). 
- * - * @param dataset - * @return - */ - private double[][] correlateSamples(DoubleMatrixDataset dataset) { - double[][] correlationMatrix = new double[dataset.colObjects.size()][dataset.colObjects.size()]; - double probeCountMinusOne = dataset.rowObjects.size() - 1; + public double[][] correlateProbes(DoubleMatrixDataset dataset) { + + double[][] correlationMatrix = new double[dataset.rowObjects.size()][dataset.rowObjects.size()]; + double probeCountMinusOne = dataset.rowObjects.size() - 1; + + ProgressBar pb = new ProgressBar(dataset.rowObjects.size(), "- Calculating correlations: " + dataset.rowObjects.size() + " x " + dataset.rowObjects.size()); + for (int f = 0; f < dataset.rowObjects.size(); f++) { + for (int g = f; g < dataset.rowObjects.size(); g++) { + double covarianceInterim = 0; + for (int p = 0; p < dataset.rowObjects.size(); p++) { + covarianceInterim += dataset.getRawData()[p][f] * dataset.getRawData()[p][g]; + } + double covariance = covarianceInterim / probeCountMinusOne; + correlationMatrix[f][g] = covariance; + correlationMatrix[g][f] = covariance; + System.out.println(f + "\t" + g + "\t" + covariance); + } + pb.iterate(); + } + pb.close(); + return correlationMatrix; + } - ProgressBar pb = new ProgressBar(dataset.colObjects.size(), "- Calculating correlations: " + dataset.colObjects.size() + " x " + dataset.colObjects.size()); + public Pair, DoubleMatrixDataset> calculatePCA(DoubleMatrixDataset dataset, double[][] correlationMatrix, String fileNamePrefix, Integer nrOfPCsToCalculate) throws IOException { + String expressionFile = fileNamePrefix; + System.out.println("Calculating PCA over file: " + fileNamePrefix); + System.out.println("- Performing PCA over correlation matrix of size: " + correlationMatrix.length + "x" + correlationMatrix.length); + Jama.EigenvalueDecomposition eig = PCA.eigenValueDecomposition(correlationMatrix); - for (int f = 0; f < dataset.colObjects.size(); f++) { + if (nrOfPCsToCalculate == null || nrOfPCsToCalculate > 
dataset.colObjects.size()) { + nrOfPCsToCalculate = dataset.colObjects.size(); + } else if (nrOfPCsToCalculate < 1) { + throw new IllegalArgumentException("Number of PCs to calculate should be at least 1"); + } + DoubleMatrixDataset datasetEV = new DoubleMatrixDataset(dataset.colObjects.size(), nrOfPCsToCalculate); + datasetEV.rowObjects = dataset.colObjects; + double[] eigenValues = eig.getRealEigenvalues(); + System.out.println("Eigenvalue results:"); + + System.out.println("PCA\tPCANr\tEigenValue\tExplainedVariance\tTotalExplainedVariance"); + + TextFile out = new TextFile(expressionFile + ".PCAOverSamplesEigenvalues.txt.gz", TextFile.W); + double cumExpVarPCA = 0; + + out.writeln("PCA\tPCANr\tEigenValue\tExplainedVariance\tTotalExplainedVariance"); + + for (int pca = 0; pca < nrOfPCsToCalculate; pca++) { + double expVarPCA = PCA.getEigenValueVar(eigenValues, pca); + double[] pca1ExpEigenVector = PCA.getEigenVector(eig, eigenValues, pca); + for (int s = 0; s < dataset.colObjects.size(); s++) { + datasetEV.getRawData()[s][pca] = pca1ExpEigenVector[s]; + } + int pcaNr = pca + 1; + cumExpVarPCA += expVarPCA; + out.write(pcaNr + "\t" + eigenValues[eigenValues.length - 1 - pca] + "\t" + expVarPCA + "\t" + cumExpVarPCA + "\n"); + datasetEV.colObjects.set(pca, "Comp" + String.valueOf(pcaNr)); + System.out.println("PCA:\t" + pcaNr + "\t" + eigenValues[eigenValues.length - 1 - pca] + "\t" + expVarPCA + "\t" + cumExpVarPCA); + } + out.close(); + datasetEV.save(expressionFile + ".PCAOverSamplesEigenvectors.txt.gz"); - for (int g = f; g < dataset.colObjects.size(); g++) { - double covarianceInterim = 0; - for (int p = 0; p < dataset.rowObjects.size(); p++) { - covarianceInterim += dataset.getRawData()[p][f] * dataset.getRawData()[p][g]; - } - double covariance = covarianceInterim / probeCountMinusOne; - correlationMatrix[f][g] = covariance; - correlationMatrix[g][f] = covariance; -// System.out.println(f + "\t" + g + "\t" + covariance); - } - pb.iterate(); - } - pb.close(); 
- return correlationMatrix; - } - - public double[][] correlateProbes(DoubleMatrixDataset dataset) { - - double[][] correlationMatrix = new double[dataset.rowObjects.size()][dataset.rowObjects.size()]; - double probeCountMinusOne = dataset.rowObjects.size() - 1; - - ProgressBar pb = new ProgressBar(dataset.rowObjects.size(), "- Calculating correlations: " + dataset.rowObjects.size() + " x " + dataset.rowObjects.size()); - for (int f = 0; f < dataset.rowObjects.size(); f++) { - for (int g = f; g < dataset.rowObjects.size(); g++) { - double covarianceInterim = 0; - for (int p = 0; p < dataset.rowObjects.size(); p++) { - covarianceInterim += dataset.getRawData()[p][f] * dataset.getRawData()[p][g]; - } - double covariance = covarianceInterim / probeCountMinusOne; - correlationMatrix[f][g] = covariance; - correlationMatrix[g][f] = covariance; - System.out.println(f + "\t" + g + "\t" + covariance); - } - pb.iterate(); - } - pb.close(); - return correlationMatrix; - } - - public Pair, DoubleMatrixDataset> calculatePCA(DoubleMatrixDataset dataset, double[][] correlationMatrix, String fileNamePrefix, Integer nrOfPCsToCalculate) throws IOException { - String expressionFile = fileNamePrefix; - System.out.println("Calculating PCA over file: " + fileNamePrefix); - System.out.println("- Performing PCA over correlation matrix of size: " + correlationMatrix.length + "x" + correlationMatrix.length); - Jama.EigenvalueDecomposition eig = PCA.eigenValueDecomposition(correlationMatrix); - - if (nrOfPCsToCalculate == null || nrOfPCsToCalculate > dataset.colObjects.size()) { - nrOfPCsToCalculate = dataset.colObjects.size(); - } else if (nrOfPCsToCalculate < 1) { - throw new IllegalArgumentException("Number of PCs to calculate should be at least 1"); - } - - DoubleMatrixDataset datasetEV = new DoubleMatrixDataset(dataset.colObjects.size(), nrOfPCsToCalculate); - datasetEV.rowObjects = dataset.colObjects; - double[] eigenValues = eig.getRealEigenvalues(); - System.out.println("Eigenvalue 
results:"); - - System.out.println("PCA\tPCANr\tEigenValue\tExplainedVariance\tTotalExplainedVariance"); - - TextFile out = new TextFile(expressionFile + ".PCAOverSamplesEigenvalues.txt.gz", TextFile.W); - double cumExpVarPCA = 0; - - out.writeln("PCA\tPCANr\tEigenValue\tExplainedVariance\tTotalExplainedVariance"); - - for (int pca = 0; pca < nrOfPCsToCalculate; pca++) { - double expVarPCA = PCA.getEigenValueVar(eigenValues, pca); - double[] pca1ExpEigenVector = PCA.getEigenVector(eig, eigenValues, pca); - for (int s = 0; s < dataset.colObjects.size(); s++) { - datasetEV.getRawData()[s][pca] = pca1ExpEigenVector[s]; - } - int pcaNr = pca + 1; - cumExpVarPCA += expVarPCA; - out.write(pcaNr + "\t" + eigenValues[eigenValues.length - 1 - pca] + "\t" + expVarPCA + "\t" + cumExpVarPCA + "\n"); - datasetEV.colObjects.set(pca, "Comp" + String.valueOf(pcaNr)); - System.out.println("PCA:\t" + pcaNr + "\t" + eigenValues[eigenValues.length - 1 - pca] + "\t" + expVarPCA + "\t" + cumExpVarPCA); - } - out.close(); - - datasetEV.save(expressionFile + ".PCAOverSamplesEigenvectors.txt.gz"); - - datasetEV.transposeDataset(); - - datasetEV.save(expressionFile + ".PCAOverSamplesEigenvectorsTransposed.txt.gz"); - - datasetEV.transposeDataset(); - System.out.println("Calculating PCs"); - System.out.println("Initializing PCA matrix"); - DoubleMatrixDataset datasetPCAOverSamplesPCAs = new DoubleMatrixDataset(dataset.rowObjects.size(), nrOfPCsToCalculate); - datasetPCAOverSamplesPCAs.rowObjects = dataset.rowObjects; - for (int s = 0; s < nrOfPCsToCalculate; s++) { - datasetPCAOverSamplesPCAs.colObjects.set(s, "Comp" + String.valueOf(s + 1)); - } - for (int p = 0; p < dataset.rowObjects.size(); p++) { - for (int t = 0; t < nrOfPCsToCalculate; t++) { - datasetPCAOverSamplesPCAs.getRawData()[p][t] = 0; - } - } - - - ProgressBar pb = new ProgressBar(dataset.rowObjects.size(), "Calculating the PCA scores per probe: "); - for (int probe = 0; probe < dataset.rowObjects.size(); probe++) { - for 
(int sample1 = 0; sample1 < nrOfPCsToCalculate; sample1++) { - for (int sample2 = 0; sample2 < dataset.colObjects.size(); sample2++) { - double probeCoefficient = datasetEV.getRawData()[sample2][sample1]; - datasetPCAOverSamplesPCAs.getRawData()[probe][sample1] += probeCoefficient * dataset.getRawData()[probe][sample2]; - } - } - pb.iterate(); - } - pb.close(); - - String outfilename = expressionFile + ".PCAOverSamplesPrincipalComponents.txt.gz"; - System.out.println("Saving PCA scores: " + outfilename); - datasetPCAOverSamplesPCAs.save(outfilename); - - return new Pair, DoubleMatrixDataset>(datasetPCAOverSamplesPCAs, datasetEV); - } - - public void correctDataForPCs(DoubleMatrixDataset dataset, String fileNamePrefix, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput, - DoubleMatrixDataset datasetPCAOverSamplesPCAs, DoubleMatrixDataset datasetEV) throws IOException { - String expressionFile = fileNamePrefix; - System.out.println("\nInitializing residual gene expression matrix"); - - if (dataset.colObjects.size() < nrPCAsOverSamplesToRemove) { - int remainder = dataset.colObjects.size() % nrIntermediatePCAsOverSamplesToRemoveToOutput; - nrPCAsOverSamplesToRemove = dataset.colObjects.size() - remainder; - } - - for (int t = 0; t < nrPCAsOverSamplesToRemove; t++) { - for (int p = 0; p < dataset.rowObjects.size(); p++) { - for (int s = 0; s < dataset.colObjects.size(); s++) { - dataset.getRawData()[p][s] -= datasetPCAOverSamplesPCAs.getRawData()[p][t] * datasetEV.getRawData()[s][t]; - } - } - int nrPCAs = t + 1; - if (nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 && nrPCAs % nrIntermediatePCAsOverSamplesToRemoveToOutput == 0) { - dataset.save(expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt.gz"); - System.out.println("Removed\t" + nrPCAs + "\tPCs. File:\t" + expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt.gz"); - } - - } - dataset.save(expressionFile + "." 
+ nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved.txt.gz"); - - } - - public void repeatPCAOmitCertainPCAs(HashSet pcasNotToRemove, String parentDir, String expressionFile, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput) throws IOException { - System.out.println("Will write output to: "+parentDir); - String[] files = Gpio.getListOfFiles(parentDir); - String startExpressionFileName = expressionFile; - File st = new File(startExpressionFileName); - - // strip the parent dir name - parentDir += Gpio.getFileSeparator(); - String minimalFilename = st.getName(); - String[] expressionFileNameElems = minimalFilename.split("\\."); - String eigenvectorFile = null; - String principalComponentsFile = null; - - if(minimalFilename.contains("PCAsOverSamplesRemoved")){ - StringBuilder newMinimal = new StringBuilder(); - newMinimal.append(expressionFileNameElems[0]); - for(int i = 1; i datasetPCAOverSamplesPCAs = new DoubleMatrixDataset(dataset.rowObjects.size(), nrOfPCsToCalculate); + datasetPCAOverSamplesPCAs.rowObjects = dataset.rowObjects; + for (int s = 0; s < nrOfPCsToCalculate; s++) { + datasetPCAOverSamplesPCAs.colObjects.set(s, "Comp" + String.valueOf(s + 1)); + } + for (int p = 0; p < dataset.rowObjects.size(); p++) { + for (int t = 0; t < nrOfPCsToCalculate; t++) { + datasetPCAOverSamplesPCAs.getRawData()[p][t] = 0; + } + } + + + ProgressBar pb = new ProgressBar(dataset.rowObjects.size(), "Calculating the PCA scores per probe: "); + for (int probe = 0; probe < dataset.rowObjects.size(); probe++) { + for (int sample1 = 0; sample1 < nrOfPCsToCalculate; sample1++) { + for (int sample2 = 0; sample2 < dataset.colObjects.size(); sample2++) { + double probeCoefficient = datasetEV.getRawData()[sample2][sample1]; + datasetPCAOverSamplesPCAs.getRawData()[probe][sample1] += probeCoefficient * dataset.getRawData()[probe][sample2]; + } + } + pb.iterate(); + } + pb.close(); + + String outfilename = expressionFile + 
".PCAOverSamplesPrincipalComponents.txt.gz"; + System.out.println("Saving PCA scores: " + outfilename); + datasetPCAOverSamplesPCAs.save(outfilename); + + return new Pair, DoubleMatrixDataset>(datasetPCAOverSamplesPCAs, datasetEV); + } + + public void correctDataForPCs(DoubleMatrixDataset dataset, String fileNamePrefix, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput, + DoubleMatrixDataset datasetPCAOverSamplesPCAs, DoubleMatrixDataset datasetEV) throws IOException { + String expressionFile = fileNamePrefix; + System.out.println("\nInitializing residual gene expression matrix"); + + if (dataset.colObjects.size() < nrPCAsOverSamplesToRemove) { + int remainder = dataset.colObjects.size() % nrIntermediatePCAsOverSamplesToRemoveToOutput; + nrPCAsOverSamplesToRemove = dataset.colObjects.size() - remainder; + } + + for (int t = 0; t < nrPCAsOverSamplesToRemove; t++) { + for (int p = 0; p < dataset.rowObjects.size(); p++) { + for (int s = 0; s < dataset.colObjects.size(); s++) { + dataset.getRawData()[p][s] -= datasetPCAOverSamplesPCAs.getRawData()[p][t] * datasetEV.getRawData()[s][t]; + } + } + int nrPCAs = t + 1; + if (nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 && nrPCAs % nrIntermediatePCAsOverSamplesToRemoveToOutput == 0) { + dataset.save(expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt.gz"); + System.out.println("Removed\t" + nrPCAs + "\tPCs. File:\t" + expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt.gz"); + } + + } + dataset.save(expressionFile + "." 
+ nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved.txt.gz"); + + } + + public void repeatPCAOmitCertainPCAs(HashSet pcasNotToRemove, String parentDir, String expressionFile, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput) throws IOException { + System.out.println("Will write output to: " + parentDir); + String[] files = Gpio.getListOfFiles(parentDir); + String startExpressionFileName = expressionFile; + File st = new File(startExpressionFileName); + + // strip the parent dir name + parentDir += Gpio.getFileSeparator(); + String minimalFilename = st.getName(); + String[] expressionFileNameElems = minimalFilename.split("\\."); + String eigenvectorFile = null; + String principalComponentsFile = null; + + if (minimalFilename.contains("PCAsOverSamplesRemoved")) { + StringBuilder newMinimal = new StringBuilder(); + newMinimal.append(expressionFileNameElems[0]); + for (int i = 1; i < expressionFileNameElems.length; ++i) { + if (!expressionFileNameElems[i].contains("PCAsOverSamplesRemoved")) { + newMinimal.append(".").append(expressionFileNameElems[i]); + } + } + minimalFilename = newMinimal.toString(); + } + + for (String file : files) { // if (file.length() < minimalFilename.length() && file.contains(expressionFileNameElems[0])) { // minimalFilename = file; // } else - if (file.toLowerCase().contains("pcaoversampleseigenvectors.")) { - eigenvectorFile = parentDir + "" + file; - } else if (file.toLowerCase().contains("pcaoversamplesprincipalcomponents")) { - principalComponentsFile = parentDir + "" + file; - } - } - - boolean fileFound = true; - if (eigenvectorFile == null) { - System.err.println("Could not find file containing 'PCAOverSamplesEigenvectors' in directory: " + parentDir); - fileFound = false; - } - - if (eigenvectorFile == null) { - System.err.println("Could not find file containing 'PCAOverSamplesPrincipalComponents' in directory: " + parentDir); - fileFound = false; - } - - if (!fileFound) { - System.exit(0); - } - - 
System.out.println("Detected core file name to be: " + minimalFilename); - - DoubleMatrixDataset expressionDataset = new DoubleMatrixDataset(parentDir+minimalFilename); - DoubleMatrixDataset datasetPCAOverSamplesPCAs = new DoubleMatrixDataset(principalComponentsFile); - DoubleMatrixDataset datasetEV = new DoubleMatrixDataset(eigenvectorFile); - - if (expressionDataset.colObjects.size() < nrPCAsOverSamplesToRemove) { - int remainder = expressionDataset.colObjects.size() % nrIntermediatePCAsOverSamplesToRemoveToOutput; - nrPCAsOverSamplesToRemove = expressionDataset.colObjects.size() - remainder; - } + if (file.toLowerCase().contains("pcaoversampleseigenvectors.")) { + eigenvectorFile = parentDir + "" + file; + } else if (file.toLowerCase().contains("pcaoversamplesprincipalcomponents")) { + principalComponentsFile = parentDir + "" + file; + } + } + + boolean fileFound = true; + if (eigenvectorFile == null) { + System.err.println("Could not find file containing 'PCAOverSamplesEigenvectors' in directory: " + parentDir); + fileFound = false; + } + + if (principalComponentsFile == null) { + System.err.println("Could not find file containing 'PCAOverSamplesPrincipalComponents' in directory: " + parentDir); + fileFound = false; + } + + if (!fileFound) { + System.exit(0); + } + + System.out.println("Detected core file name to be: " + minimalFilename); + + DoubleMatrixDataset expressionDataset = new DoubleMatrixDataset(parentDir + minimalFilename); + DoubleMatrixDataset datasetPCAOverSamplesPCAs = new DoubleMatrixDataset(principalComponentsFile); + DoubleMatrixDataset datasetEV = new DoubleMatrixDataset(eigenvectorFile); + + if (expressionDataset.colObjects.size() < nrPCAsOverSamplesToRemove) { + int remainder = expressionDataset.colObjects.size() % nrIntermediatePCAsOverSamplesToRemoveToOutput; + nrPCAsOverSamplesToRemove = expressionDataset.colObjects.size() - remainder; + } // DoubleMatrixDataset datasetResidualExpressionBasedOnPCAOverSamples = new
DoubleMatrixDataset(expressionDataset.rowObjects.size(), expressionDataset.colObjects.size()); // datasetResidualExpressionBasedOnPCAOverSamples.rowObjects = expressionDataset.rowObjects; @@ -644,378 +649,377 @@ public void repeatPCAOmitCertainPCAs(HashSet pcasNotToRemove, String pa // System.arraycopy(expressionDataset.getRawData()[p], 0, datasetResidualExpressionBasedOnPCAOverSamples.getRawData()[p], 0, expressionDataset.colObjects.size()); // } - if(minimalFilename.endsWith(".txt")){ - minimalFilename = minimalFilename.substring(0, minimalFilename.length()-4); - } else if(minimalFilename.endsWith(".txt.gz")){ - minimalFilename = minimalFilename.substring(0, minimalFilename.length()-7); - } - - for (int t = 0; t < nrPCAsOverSamplesToRemove; t++) { - if (!pcasNotToRemove.contains(t + 1)) { - - for (int p = 0; p < expressionDataset.rowObjects.size(); p++) { - for (int s = 0; s < expressionDataset.colObjects.size(); s++) { - //datasetResidualExpressionBasedOnPCAOverSamples.rawData[p][s]-= datasetPCAOverSamplesPCAs.rawData[p][t] * datasetEV.rawData[s][t]; - expressionDataset.getRawData()[p][s] -= datasetPCAOverSamplesPCAs.getRawData()[p][t] * datasetEV.getRawData()[s][t]; - } - } - } else { - System.out.println("Omitting PCA: " + (t + 1) + " since this component is under genetic control"); - } - - int nrPCAs = t + 1; - - if (nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 && nrPCAs % nrIntermediatePCAsOverSamplesToRemoveToOutput == 0) { - //datasetResidualExpressionBasedOnPCAOverSamples.save(expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt"); - expressionDataset.save(parentDir+minimalFilename + "." + nrPCAs + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz"); - System.out.println("Removed\t" + nrPCAs + "\tPCs. File:\t" + minimalFilename + "." + nrPCAs + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz"); - } - - } - //datasetResidualExpressionBasedOnPCAOverSamples.save(expressionFile + "." 
+ nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved.txt"); - expressionDataset.save(parentDir+minimalFilename + "." + nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz"); - - System.out.println("Done\n"); - } - - private void correctForCovariate(double[][] rawdata, double[][] covariateValues, int covariateToCorrect) { - for (int probe = 0; probe < rawdata.length; probe++) { - double[] y = rawdata[probe]; - double meanY = JSci.maths.ArrayMath.mean(y); - double varianceY = JSci.maths.ArrayMath.variance(y); - double[] x = covariateValues[covariateToCorrect]; - - - - double[] rc = Regression.getLinearRegressionCoefficients(x, y); - double correlation = JSci.maths.ArrayMath.correlation(x, y); - double propExplainedVarianceTrait = correlation * correlation - 1.0d / (double) y.length; - - if (propExplainedVarianceTrait < 0) { - propExplainedVarianceTrait = 0; - } + if (minimalFilename.endsWith(".txt")) { + minimalFilename = minimalFilename.substring(0, minimalFilename.length() - 4); + } else if (minimalFilename.endsWith(".txt.gz")) { + minimalFilename = minimalFilename.substring(0, minimalFilename.length() - 7); + } + + for (int t = 0; t < nrPCAsOverSamplesToRemove; t++) { + if (!pcasNotToRemove.contains(t + 1)) { + + for (int p = 0; p < expressionDataset.rowObjects.size(); p++) { + for (int s = 0; s < expressionDataset.colObjects.size(); s++) { + //datasetResidualExpressionBasedOnPCAOverSamples.rawData[p][s]-= datasetPCAOverSamplesPCAs.rawData[p][t] * datasetEV.rawData[s][t]; + expressionDataset.getRawData()[p][s] -= datasetPCAOverSamplesPCAs.getRawData()[p][t] * datasetEV.getRawData()[s][t]; + } + } + } else { + System.out.println("Omitting PCA: " + (t + 1) + " since this component is under genetic control"); + } + + int nrPCAs = t + 1; + + if (nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 && nrPCAs % nrIntermediatePCAsOverSamplesToRemoveToOutput == 0) { + //datasetResidualExpressionBasedOnPCAOverSamples.save(expressionFile + "." 
+ nrPCAs + "PCAsOverSamplesRemoved.txt"); + expressionDataset.save(parentDir + minimalFilename + "." + nrPCAs + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz"); + System.out.println("Removed\t" + nrPCAs + "\tPCs. File:\t" + minimalFilename + "." + nrPCAs + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz"); + } + + } + //datasetResidualExpressionBasedOnPCAOverSamples.save(expressionFile + "." + nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved.txt"); + expressionDataset.save(parentDir + minimalFilename + "." + nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz"); + + System.out.println("Done\n"); + } + + private void correctForCovariate(double[][] rawdata, double[][] covariateValues, int covariateToCorrect) { + for (int probe = 0; probe < rawdata.length; probe++) { + double[] y = rawdata[probe]; + double meanY = JSci.maths.ArrayMath.mean(y); + double varianceY = JSci.maths.ArrayMath.variance(y); + double[] x = covariateValues[covariateToCorrect]; + + + double[] rc = Regression.getLinearRegressionCoefficients(x, y); + double correlation = JSci.maths.ArrayMath.correlation(x, y); + double propExplainedVarianceTrait = correlation * correlation - 1.0d / (double) y.length; + + if (propExplainedVarianceTrait < 0) { + propExplainedVarianceTrait = 0; + } // explainedVariancePerEQTLProbe[d][(int) Math.round(propExplainedVarianceTrait * 100d)]++; - double[] rawDataUpdated = new double[x.length]; - for (int s = 0; s < x.length; s++) { - double residual = y[s] - x[s] * rc[0]; - rawDataUpdated[s] = residual; - } - - double meanUpdated = JSci.maths.ArrayMath.mean(rawDataUpdated); - double stdDevRatio = JSci.maths.ArrayMath.standardDeviation(rawDataUpdated) / Math.sqrt(varianceY); - for (int s = 0; s < x.length; s++) { - rawDataUpdated[s] -= meanUpdated; - rawDataUpdated[s] /= stdDevRatio; - rawDataUpdated[s] += meanY; - } - System.arraycopy(rawDataUpdated, 0, rawdata[probe], 0, x.length); - } - } - - // NOTE: this new code 
switches around columns and rows for the covariate matrix - private Pair, DoubleMatrixDataset> loadCovariateValues(String covariatesToRemove, DoubleMatrixDataset dataset) throws IOException { - System.out.println("- Removing covariates as defined in: " + covariatesToRemove); - TextFile covariates = new TextFile(covariatesToRemove, TextFile.R); - int numRows = covariates.countLines() - 1; // minus the header :) - int numCols = covariates.countCols(TextFile.tab) - 1; // minus the header's row identifier (if any) - - if (numRows == 0 || numCols == 0) { - System.err.println("Covariate file is empty, but no covariates found in file! Is your file format correct?"); - System.err.println("The program is expecting the following: tab separated, one covariate per row, one sample per column, with sample identifiers identical to your --in file."); - System.exit(0); - } else { - System.out.println("Covariate file has " + numRows + " rows and " + numCols + " columns"); - } - - - // first hash up which samples are in the dataset - HashMap samplesInDatasetIndex = new HashMap(); - String[] allSamplesInDataset = dataset.colObjects.toArray(new String[0]); - for (int i = 0; i < allSamplesInDataset.length; i++) { - samplesInDatasetIndex.put(allSamplesInDataset[i], i); - } - - // read the column names from the covariate file - // expect the samples on the columns - String[] elems = covariates.readLineElemsReturnReference(TextFile.tab); // header - - int ctr = 0; - boolean[] sampleInDatasetIncludedInCovariates = new boolean[dataset.colObjects.size()]; - ArrayList columnNames = new ArrayList(); - for (int i = 1; i < elems.length; i++) { - Integer index = samplesInDatasetIndex.get(elems[i]); - columnNames.add(elems[i]); - if (index != null) { - sampleInDatasetIncludedInCovariates[index] = true; - ctr++; - } - } - - // read the covariate names, expect them to be on the rows - ArrayList rowNames = new ArrayList(); - elems = covariates.readLineElemsReturnReference(TextFile.tab); // first line 
- while (elems != null) { - rowNames.add(elems[0]); - elems = covariates.readLineElemsReturnReference(TextFile.tab); - } - covariates.close(); - - boolean isTransposed = false; - if (ctr == 0) { - System.err.println("No matching samples detected between covariate file and dataset. Maybe your covariate file needs to be transposed? Will test that for you now:"); - for (String rowName : rowNames) { - Integer index = samplesInDatasetIndex.get(rowName); - if (index != null) { - sampleInDatasetIncludedInCovariates[index] = true; - ctr++; - } - } - - if (ctr == 0) { - System.err.println("Transposing the data does not seem to resolve the issue. Please check your sample identifiers."); - System.exit(0); - } else { - System.out.println("Transposing the covariate file reveals: " + ctr + " samples present."); - isTransposed = true; - - } - - - } + double[] rawDataUpdated = new double[x.length]; + for (int s = 0; s < x.length; s++) { + double residual = y[s] - x[s] * rc[0]; + rawDataUpdated[s] = residual; + } + + double meanUpdated = JSci.maths.ArrayMath.mean(rawDataUpdated); + double stdDevRatio = JSci.maths.ArrayMath.standardDeviation(rawDataUpdated) / Math.sqrt(varianceY); + for (int s = 0; s < x.length; s++) { + rawDataUpdated[s] -= meanUpdated; + rawDataUpdated[s] /= stdDevRatio; + rawDataUpdated[s] += meanY; + } + System.arraycopy(rawDataUpdated, 0, rawdata[probe], 0, x.length); + } + } + + // NOTE: this new code switches around columns and rows for the covariate matrix + private Pair, DoubleMatrixDataset> loadCovariateValues(String covariatesToRemove, DoubleMatrixDataset dataset) throws IOException { + System.out.println("- Removing covariates as defined in: " + covariatesToRemove); + TextFile covariates = new TextFile(covariatesToRemove, TextFile.R); + int numRows = covariates.countLines() - 1; // minus the header :) + int numCols = covariates.countCols(TextFile.tab) - 1; // minus the header's row identifier (if any) + + if (numRows == 0 || numCols == 0) { + 
System.err.println("Covariate file is empty, but no covariates found in file! Is your file format correct?"); + System.err.println("The program is expecting the following: tab separated, one covariate per row, one sample per column, with sample identifiers identical to your --in file."); + System.exit(0); + } else { + System.out.println("Covariate file has " + numRows + " rows and " + numCols + " columns"); + } + + + // first hash up which samples are in the dataset + HashMap samplesInDatasetIndex = new HashMap(); + String[] allSamplesInDataset = dataset.colObjects.toArray(new String[0]); + for (int i = 0; i < allSamplesInDataset.length; i++) { + samplesInDatasetIndex.put(allSamplesInDataset[i], i); + } + + // read the column names from the covariate file + // expect the samples on the columns + String[] elems = covariates.readLineElemsReturnReference(TextFile.tab); // header + + int ctr = 0; + boolean[] sampleInDatasetIncludedInCovariates = new boolean[dataset.colObjects.size()]; + ArrayList columnNames = new ArrayList(); + for (int i = 1; i < elems.length; i++) { + Integer index = samplesInDatasetIndex.get(elems[i]); + columnNames.add(elems[i]); + if (index != null) { + sampleInDatasetIncludedInCovariates[index] = true; + ctr++; + } + } + + // read the covariate names, expect them to be on the rows + ArrayList rowNames = new ArrayList(); + elems = covariates.readLineElemsReturnReference(TextFile.tab); // first line + while (elems != null) { + rowNames.add(elems[0]); + elems = covariates.readLineElemsReturnReference(TextFile.tab); + } + covariates.close(); + + boolean isTransposed = false; + if (ctr == 0) { + System.err.println("No matching samples detected between covariate file and dataset. Maybe your covariate file needs to be transposed? 
Will test that for you now:"); + for (String rowName : rowNames) { + Integer index = samplesInDatasetIndex.get(rowName); + if (index != null) { + sampleInDatasetIncludedInCovariates[index] = true; + ctr++; + } + } + + if (ctr == 0) { + System.err.println("Transposing the data does not seem to resolve the issue. Please check your sample identifiers."); + System.exit(0); + } else { + System.out.println("Transposing the covariate file reveals: " + ctr + " samples present."); + isTransposed = true; + + } + + + } // if (dataset.colObjects.size() != numSamples) { // System.out.println("Covariates loaded from: " + covariatesToRemove + ", but the number of samples does not correspond! " + numSamples + " in covariates file, " + dataset.colObjects.size() + " in dataset..."); // System.out.println("Please note that missing samples will be removed from your eventual corrected --in file."); // } - if (ctr < dataset.colObjects.size()) { - System.err.println("Covariates loaded from: " + covariatesToRemove + ", but not all samples present in covariates file! 
" + ctr + " present in covariates file, out of " + dataset.colObjects.size() + " in dataset..."); - System.out.println("Your dataset will be adjusted accordingly."); - } - int nrCovariates = numRows; - if (isTransposed) { - nrCovariates = numCols; - } - - // make matrix with equal sample size - double[][] covariateValues = new double[nrCovariates][dataset.colObjects.size()]; - for (int row = 0; row < covariateValues.length; row++) { - for (int col = 0; col < covariateValues[row].length; col++) { - covariateValues[row][col] = Double.NaN; - } - } - - int lineCtr = 0; - covariates.open(); - String[] headerElems = covariates.readLineElemsReturnReference(TextFile.tab); // header - elems = covariates.readLineElemsReturnReference(TextFile.tab); - while (elems != null) { - if (isTransposed) { - String sampleName = elems[0]; - Integer sampleIdInDataset = samplesInDatasetIndex.get(sampleName); - if (sampleIdInDataset != null) { - for (int i = 1; i < elems.length; i++) { - try { - covariateValues[i - 1][sampleIdInDataset] = Double.parseDouble(elems[i]); - } catch (NumberFormatException e) { + if (ctr < dataset.colObjects.size()) { + System.err.println("Covariates loaded from: " + covariatesToRemove + ", but not all samples present in covariates file! 
" + ctr + " present in covariates file, out of " + dataset.colObjects.size() + " in dataset..."); + System.out.println("Your dataset will be adjusted accordingly."); + } + int nrCovariates = numRows; + if (isTransposed) { + nrCovariates = numCols; + } + + // make matrix with equal sample size + double[][] covariateValues = new double[nrCovariates][dataset.colObjects.size()]; + for (int row = 0; row < covariateValues.length; row++) { + for (int col = 0; col < covariateValues[row].length; col++) { + covariateValues[row][col] = Double.NaN; + } + } + + int lineCtr = 0; + covariates.open(); + String[] headerElems = covariates.readLineElemsReturnReference(TextFile.tab); // header + elems = covariates.readLineElemsReturnReference(TextFile.tab); + while (elems != null) { + if (isTransposed) { + String sampleName = elems[0]; + Integer sampleIdInDataset = samplesInDatasetIndex.get(sampleName); + if (sampleIdInDataset != null) { + for (int i = 1; i < elems.length; i++) { + try { + covariateValues[i - 1][sampleIdInDataset] = Double.parseDouble(elems[i]); + } catch (NumberFormatException e) { // System.out.println("WARNING: " + elems[i] + " is not a numeric value! 
in " + covariatesToRemove + " at line: " + (lineCtr + 1) + "."); // covariateValues[i - 1][sampleIdInDataset] = Double.NaN; // sampleInDatasetIncludedInCovariates[sampleIdInDataset] = false; - } - } - } - } else { - for (int i = 1; i < elems.length; i++) { - String sampleName = headerElems[i]; - Integer sampleIdInDataset = samplesInDatasetIndex.get(sampleName); - if (sampleIdInDataset != null) { - try { - covariateValues[lineCtr][sampleIdInDataset] = Double.parseDouble(elems[i]); - } catch (NumberFormatException e) { + } + } + } + } else { + for (int i = 1; i < elems.length; i++) { + String sampleName = headerElems[i]; + Integer sampleIdInDataset = samplesInDatasetIndex.get(sampleName); + if (sampleIdInDataset != null) { + try { + covariateValues[lineCtr][sampleIdInDataset] = Double.parseDouble(elems[i]); + } catch (NumberFormatException e) { // System.out.println("WARNING: " + elems[i] + " is not a numeric value at line: " + (lineCtr + 1) + "\tcolumn: " + i); - } - } - } - } - elems = covariates.readLineElemsReturnReference(TextFile.tab); - lineCtr++; - } - covariates.close(); - - // investigate how many covariates there actually is data for. - int covariateCtr = 0; - boolean[] includeCovariate = new boolean[covariateValues.length]; - for (int row = 0; row < covariateValues.length; row++) { - int nrColsFilled = 0; - for (int col = 0; col < covariateValues[row].length; col++) { - if (!Double.isNaN(covariateValues[row][col])) { - nrColsFilled++; - } - } - - if (nrColsFilled == 0) { - // there's no data for this covariate.... - includeCovariate[row] = false; - } else { - includeCovariate[row] = true; - covariateCtr++; - } - } - - if (covariateCtr == 0) { - System.err.println("ERROR: none of your covariates seem to have valid numerical values.. 
Please check your covariate file."); - System.exit(0); - } else { - System.out.println("After removing covariates without data, your dataset will have " + covariateCtr + " covariates (out of: " + covariateValues.length + ") ."); - } - - ArrayList covariateNames = null; - if (isTransposed) { - covariateNames = columnNames; - } else { - covariateNames = rowNames; - } - - if (covariateCtr != covariateValues.length) { - // remove covariates with missing values - System.out.println("Removing covariates that have no data at all."); - double[][] newCovariateData = new double[covariateCtr][dataset.colObjects.size()]; - ArrayList newCovariateNames = new ArrayList(); - int newCovariateCTR = 0; - for (int row = 0; row < covariateValues.length; row++) { - if (includeCovariate[row]) { - newCovariateNames.add(covariateNames.get(row)); - - for (int col = 0; col < covariateValues[row].length; col++) { - newCovariateData[newCovariateCTR][col] = covariateValues[row][col]; - - // check whether we should include all samples, but don't remove yet: sync this with the expression/whatever dastaset - if (Double.isNaN(covariateValues[row][col])) { - sampleInDatasetIncludedInCovariates[col] = false; - } - } - newCovariateCTR++; - } else { - System.out.println(covariateNames.get(row) + " removed."); - } - } - - - nrCovariates = newCovariateCTR; - covariateValues = newCovariateData; - covariateNames = newCovariateNames; - } - System.out.println(""); - System.out.println("Remaining covariates: "); - for (String s : covariateNames) { - System.out.println(s); - } - System.out.println(""); - // investigate how many samples there actually is data for. 
- for (int row = 0; row < covariateValues.length; row++) { - for (int col = 0; col < covariateValues[row].length; col++) { - if (Double.isNaN(covariateValues[row][col])) { - sampleInDatasetIncludedInCovariates[col] = false; - } - } - } - - int sampleCtr = 0; - for (int q = 0; q < sampleInDatasetIncludedInCovariates.length; q++) { - if (sampleInDatasetIncludedInCovariates[q]) { - sampleCtr++; - } - } - - // remove samples that have a missing value for at least one covariate + } + } + } + } + elems = covariates.readLineElemsReturnReference(TextFile.tab); + lineCtr++; + } + covariates.close(); + + // investigate how many covariates there actually is data for. + int covariateCtr = 0; + boolean[] includeCovariate = new boolean[covariateValues.length]; + for (int row = 0; row < covariateValues.length; row++) { + int nrColsFilled = 0; + for (int col = 0; col < covariateValues[row].length; col++) { + if (!Double.isNaN(covariateValues[row][col])) { + nrColsFilled++; + } + } + + if (nrColsFilled == 0) { + // there's no data for this covariate.... + includeCovariate[row] = false; + } else { + includeCovariate[row] = true; + covariateCtr++; + } + } + + if (covariateCtr == 0) { + System.err.println("ERROR: none of your covariates seem to have valid numerical values.. 
Please check your covariate file."); + System.exit(0); + } else { + System.out.println("After removing covariates without data, your dataset will have " + covariateCtr + " covariates (out of: " + covariateValues.length + ") ."); + } + + ArrayList covariateNames = null; + if (isTransposed) { + covariateNames = columnNames; + } else { + covariateNames = rowNames; + } + + if (covariateCtr != covariateValues.length) { + // remove covariates with missing values + System.out.println("Removing covariates that have no data at all."); + double[][] newCovariateData = new double[covariateCtr][dataset.colObjects.size()]; + ArrayList newCovariateNames = new ArrayList(); + int newCovariateCTR = 0; + for (int row = 0; row < covariateValues.length; row++) { + if (includeCovariate[row]) { + newCovariateNames.add(covariateNames.get(row)); + + for (int col = 0; col < covariateValues[row].length; col++) { + newCovariateData[newCovariateCTR][col] = covariateValues[row][col]; + + // check whether we should include all samples, but don't remove yet: sync this with the expression/whatever dastaset + if (Double.isNaN(covariateValues[row][col])) { + sampleInDatasetIncludedInCovariates[col] = false; + } + } + newCovariateCTR++; + } else { + System.out.println(covariateNames.get(row) + " removed."); + } + } + + + nrCovariates = newCovariateCTR; + covariateValues = newCovariateData; + covariateNames = newCovariateNames; + } + System.out.println(""); + System.out.println("Remaining covariates: "); + for (String s : covariateNames) { + System.out.println(s); + } + System.out.println(""); + // investigate how many samples there actually is data for. 
+ for (int row = 0; row < covariateValues.length; row++) { + for (int col = 0; col < covariateValues[row].length; col++) { + if (Double.isNaN(covariateValues[row][col])) { + sampleInDatasetIncludedInCovariates[col] = false; + } + } + } + + int sampleCtr = 0; + for (int q = 0; q < sampleInDatasetIncludedInCovariates.length; q++) { + if (sampleInDatasetIncludedInCovariates[q]) { + sampleCtr++; + } + } + + // remove samples that have a missing value for at least one covariate // if (sampleCtr == sampleInDatasetIncludedInCovariates.length) { // System.out.println("There were no missing values or samples in your covariate file. Sample size will remain unchanged."); // DoubleMatrixDataset covariateDataset = new DoubleMatrixDataset(covariateValues, dataset.rowObjects, covariateNames); // return new Pair, DoubleMatrixDataset>(covariateDataset, dataset); // } else { - System.out.println("Your covariate corrected dataset will have " + sampleCtr + " samples, after removing samples with missing covariate values."); - double[][] rawData = dataset.getRawData(); - double[][] newRawData = new double[rawData.length][sampleCtr]; - double[][] finalCovariateData = new double[nrCovariates][sampleCtr]; - ArrayList newColObjects = new ArrayList(); - - for (int col = 0; col < dataset.colObjects.size(); col++) { - if (sampleInDatasetIncludedInCovariates[col]) { - newColObjects.add(dataset.colObjects.get(col)); - } - } - - for (int row = 0; row < rawData.length; row++) { - int includedSampleCtr = 0; - for (int col = 0; col < dataset.colObjects.size(); col++) { - if (sampleInDatasetIncludedInCovariates[col]) { - // include sample - newRawData[row][includedSampleCtr] = rawData[row][col]; - includedSampleCtr++; - } - } - } - - for (int row = 0; row < covariateValues.length; row++) { - int includedCovariateSampleCtr = 0; - for (int col = 0; col < dataset.colObjects.size(); col++) { - // replace covariate data... 
- if (sampleInDatasetIncludedInCovariates[col]) { - finalCovariateData[row][includedCovariateSampleCtr] = covariateValues[row][col]; - includedCovariateSampleCtr++; - } - } - } - - DoubleMatrixDataset covariateDataset = new DoubleMatrixDataset(finalCovariateData, covariateNames, newColObjects); - covariateDataset.save(covariatesToRemove + "-asLoadedByNormalizer.txt"); - DoubleMatrixDataset newDataset = new DoubleMatrixDataset(newRawData, dataset.rowObjects, newColObjects); - newDataset.save(dataset.fileName + "-SampleSizeCorrectedForCovariates.txt"); - return new Pair, DoubleMatrixDataset>(covariateDataset, newDataset); + System.out.println("Your covariate corrected dataset will have " + sampleCtr + " samples, after removing samples with missing covariate values."); + double[][] rawData = dataset.getRawData(); + double[][] newRawData = new double[rawData.length][sampleCtr]; + double[][] finalCovariateData = new double[nrCovariates][sampleCtr]; + ArrayList newColObjects = new ArrayList(); + + for (int col = 0; col < dataset.colObjects.size(); col++) { + if (sampleInDatasetIncludedInCovariates[col]) { + newColObjects.add(dataset.colObjects.get(col)); + } + } + + for (int row = 0; row < rawData.length; row++) { + int includedSampleCtr = 0; + for (int col = 0; col < dataset.colObjects.size(); col++) { + if (sampleInDatasetIncludedInCovariates[col]) { + // include sample + newRawData[row][includedSampleCtr] = rawData[row][col]; + includedSampleCtr++; + } + } + } + + for (int row = 0; row < covariateValues.length; row++) { + int includedCovariateSampleCtr = 0; + for (int col = 0; col < dataset.colObjects.size(); col++) { + // replace covariate data... 
+ if (sampleInDatasetIncludedInCovariates[col]) { + finalCovariateData[row][includedCovariateSampleCtr] = covariateValues[row][col]; + includedCovariateSampleCtr++; + } + } + } + + DoubleMatrixDataset covariateDataset = new DoubleMatrixDataset(finalCovariateData, covariateNames, newColObjects); + covariateDataset.save(covariatesToRemove + "-asLoadedByNormalizer.txt"); + DoubleMatrixDataset newDataset = new DoubleMatrixDataset(newRawData, dataset.rowObjects, newColObjects); + newDataset.save(dataset.fileName + "-SampleSizeCorrectedForCovariates.txt"); + return new Pair, DoubleMatrixDataset>(covariateDataset, newDataset); // } - } - - private String removeProbesWithZeroVariance(DoubleMatrixDataset dataset, String outputFileNamePrefix) throws IOException { - boolean[] dataHasZeroVariance = new boolean[dataset.nrRows]; - int nrRowsWithZeroVariance = 0; - for (int row = 0; row < dataset.nrRows; row++) { - double[] data = dataset.rawData[row]; - double var = JSci.maths.ArrayMath.variance(data); - if (var == 0d) { - System.out.println("Removing probe with zero variance: " + dataset.rowObjects.get(row) + " on line " + (row + 1)); - nrRowsWithZeroVariance++; - dataHasZeroVariance[row] = true; - } - } - - if (nrRowsWithZeroVariance > 0) { - int newNrRows = dataset.nrRows - nrRowsWithZeroVariance; - if (newNrRows == 0) { - System.err.println("ERROR: all probes have zero variance!"); - System.exit(-1); - } - - - double[][] newData = new double[newNrRows][dataset.nrCols]; - int ctr = 0; - ArrayList newRowHeader = new ArrayList(); - for (int row = 0; row < dataset.nrRows; row++) { - if (!dataHasZeroVariance[row]) { - newData[ctr] = dataset.rawData[row]; - newRowHeader.add(dataset.rowObjects.get(row)); - ctr++; - } - } - - dataset.rawData = newData; - dataset.rowObjects = newRowHeader; - dataset.recalculateHashMaps(); - String outputFileName = outputFileNamePrefix + ".ProbesWithZeroVarianceRemoved"; - dataset.save(outputFileName + ".txt.gz"); - return outputFileName; - } - - 
return outputFileNamePrefix; - } + } + + private String removeProbesWithZeroVariance(DoubleMatrixDataset dataset, String outputFileNamePrefix) throws IOException { + boolean[] dataHasZeroVariance = new boolean[dataset.nrRows]; + int nrRowsWithZeroVariance = 0; + for (int row = 0; row < dataset.nrRows; row++) { + double[] data = dataset.rawData[row]; + double var = JSci.maths.ArrayMath.variance(data); + if (var == 0d) { + System.out.println("Removing probe with zero variance: " + dataset.rowObjects.get(row) + " on line " + (row + 1)); + nrRowsWithZeroVariance++; + dataHasZeroVariance[row] = true; + } + } + + if (nrRowsWithZeroVariance > 0) { + int newNrRows = dataset.nrRows - nrRowsWithZeroVariance; + if (newNrRows == 0) { + System.err.println("ERROR: all probes have zero variance!"); + System.exit(-1); + } + + + double[][] newData = new double[newNrRows][dataset.nrCols]; + int ctr = 0; + ArrayList newRowHeader = new ArrayList(); + for (int row = 0; row < dataset.nrRows; row++) { + if (!dataHasZeroVariance[row]) { + newData[ctr] = dataset.rawData[row]; + newRowHeader.add(dataset.rowObjects.get(row)); + ctr++; + } + } + + dataset.rawData = newData; + dataset.rowObjects = newRowHeader; + dataset.recalculateHashMaps(); + String outputFileName = outputFileNamePrefix + ".ProbesWithZeroVarianceRemoved"; + dataset.save(outputFileName + ".txt.gz"); + return outputFileName; + } + + return outputFileNamePrefix; + } } diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/HiCTransQTLAnnotator.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/HiCTransQTLAnnotator.java new file mode 100644 index 000000000..f044c389e --- /dev/null +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/HiCTransQTLAnnotator.java @@ -0,0 +1,457 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +package eqtlmappingpipeline.util; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.regex.Pattern; +import org.apache.commons.lang3.StringUtils; +import umcg.genetica.containers.Pair; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.io.trityper.EQTL; +import umcg.genetica.io.trityper.QTLTextFile; + +/** + * + * @author MarcJan + */ +class HiCTransQTLAnnotator { + + private static final Pattern SPLIT_TAB = Pattern.compile("\t"); + + public static void main(String[] args) throws IOException { + //"D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\eQTLsFDR0.05-ProbeLevel_BsFiltered_And_Filtered.txt" + + String QTLfile = "D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\eQTLsFDR0.05-ProbeLevel_BsFiltered&Filtered.txt"; + String proxyfile = "D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\proxiesMeQTLSnps.txt"; + String QTLoutfile = "D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\eQTLsFDR0.05-ProbeLevel_BsFiltered&Filtered_HiC_LD_annotated.txt"; + String folderHighC = "F:\\Contacts\\GM12878_combined_interchromosomal\\"; + String resolution = "1kb"; + String qualityCutOff = "E30"; //0 or E30 + String normMethod = null; //null / KRnorm / SQRTVCnorm / VCnorm + double minValueQuality = 0; + + boolean lowMemMode = true; + + if (!lowMemMode) { + addAnnotationToQTLOutput( + QTLfile, + proxyfile, + folderHighC, + resolution, + qualityCutOff, + normMethod, + minValueQuality, + QTLoutfile); + } else { + addAnnotationToQTLOutputLowMem( + QTLfile, + proxyfile, + folderHighC, + 
resolution, + qualityCutOff, + normMethod, + minValueQuality, + QTLoutfile); + } + } + + static void addAnnotationToQTLOutput(String in, String inProxies, String folderHighC, String resolution, String qualityCutOff, String normMethod, double minValue, String out) throws IOException { + QTLTextFile eqtlTextFile = new QTLTextFile(in, QTLTextFile.R); + + ArrayList qtls = eqtlTextFile.readList(); + + if (inProxies != null) { + qtls = includeProxyInfo(qtls, inProxies); + } + + HashMap>> contactBuffer = new HashMap>>(); + //Here we need to make a new Type to store the potentialy inflated files. + TextFile outWriter = new TextFile(out, TextFile.W); + for (EQTL eqtl : qtls) { + String chrProbe = String.valueOf(eqtl.getProbeChr()); + String chrSnp = String.valueOf(eqtl.getRsChr()); + +// System.out.println(chrProbe+"\t"+chrSnp); + if (chrProbe.equals(chrSnp)) { + //Here we need to check how to normalize and treat intra-chromosomal data. + continue; + } + + int posChrSmaller; + int posChrLarger; + + LinkedHashSet> interestRegions = null; + if (Integer.parseInt(chrProbe) < Integer.parseInt(chrSnp)) { + posChrSmaller = eqtl.getProbeChrPos(); + posChrLarger = eqtl.getRsChrPos(); + if (contactBuffer.containsKey("chr" + chrProbe + "_chr" + chrSnp)) { + interestRegions = contactBuffer.get("chr" + chrProbe + "_chr" + chrSnp); + } else { + String baseName = folderHighC + resolution + "_resolution_interchromosomal\\chr" + chrProbe + "_chr" + chrSnp + "\\MAPQG" + qualityCutOff; + String fileToReads = baseName + "\\chr" + chrProbe + "_" + chrSnp + "_1kb.RAWobserved"; +// System.out.println("Reading: " + fileToReads); + if (normMethod == null) { + interestRegions = readRawInterContactInformation(fileToReads, minValue); + } else { + interestRegions = readNormalizedInterContactInformation(fileToReads, baseName, normMethod, chrProbe, chrSnp, resolution, minValue); + } + contactBuffer.put("chr" + chrProbe + "_chr" + chrSnp, interestRegions); + } + } else { + posChrSmaller = 
eqtl.getRsChrPos(); + posChrLarger = eqtl.getProbeChrPos(); + if (contactBuffer.containsKey("chr" + chrSnp + "_chr" + chrProbe)) { + interestRegions = contactBuffer.get("chr" + chrSnp + "_chr" + chrProbe); + } else { + String baseName = folderHighC + resolution + "_resolution_interchromosomal\\chr" + chrSnp + "_chr" + chrProbe + "\\MAPQG" + qualityCutOff; + String fileToReads = baseName + "\\chr" + chrSnp + "_" + chrProbe + "_1kb.RAWobserved"; +// System.out.println("Reading: " + fileToReads); + if (normMethod == null) { + interestRegions = readRawInterContactInformation(fileToReads, minValue); + } else { + interestRegions = readNormalizedInterContactInformation(fileToReads, baseName, normMethod, chrSnp, chrProbe, resolution, minValue); + } + contactBuffer.put("chr" + chrSnp + "_chr" + chrProbe, interestRegions); + } + } + + if (determineContact(posChrSmaller, posChrLarger, interestRegions, getNumericResolution(resolution))) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + } + } + outWriter.close(); + } + + static void addAnnotationToQTLOutputLowMem(String in, String inProxies, String folderHighC, String resolution, String qualityCutOff, String normMethod, double minValue, String out) throws IOException { + QTLTextFile eqtlTextFile = new QTLTextFile(in, QTLTextFile.R); + + ArrayList qtls = eqtlTextFile.readList(); + + if (inProxies != null) { + qtls = includeProxyInfo(qtls, inProxies); + } + + //Here we need to make a new Type to store the potentialy inflated files. + TextFile outWriter = new TextFile(out, TextFile.W); + for (EQTL eqtl : qtls) { + String chrProbe = String.valueOf(eqtl.getProbeChr()); + String chrSnp = String.valueOf(eqtl.getRsChr()); + +// System.out.println(chrProbe+"\t"+chrSnp); + if (chrProbe.equals(chrSnp)) { + //Here we need to check how to normalize and treat intra-chromosomal data. 
+ continue; + } + + int posChrSmaller; + int posChrLarger; + + if (Integer.parseInt(chrProbe) < Integer.parseInt(chrSnp)) { + posChrSmaller = eqtl.getProbeChrPos(); + posChrLarger = eqtl.getRsChrPos(); + String baseName = folderHighC + resolution + "_resolution_interchromosomal\\chr" + chrProbe + "_chr" + chrSnp + "\\MAPQG" + qualityCutOff; + String fileToReads = baseName + "\\chr" + chrProbe + "_" + chrSnp + "_1kb.RAWobserved"; +// System.out.println("Reading: " + fileToReads); + if (normMethod == null) { + if (readRawInterContactInformationLowMem(fileToReads, minValue, posChrSmaller, posChrLarger, resolution)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + } + } else { + if (readNormalizedInterContactInformationLowMem(fileToReads, baseName, normMethod, chrProbe, chrSnp, posChrSmaller, posChrLarger, resolution, minValue)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + } + } + + } else { + posChrSmaller = eqtl.getRsChrPos(); + posChrLarger = eqtl.getProbeChrPos(); + + String baseName = folderHighC + resolution + "_resolution_interchromosomal\\chr" + chrSnp + "_chr" + chrProbe + "\\MAPQG" + qualityCutOff; + String fileToReads = baseName + "\\chr" + chrSnp + "_" + chrProbe + "_1kb.RAWobserved"; +// System.out.println("Reading: " + fileToReads); + if (normMethod == null) { + if (readRawInterContactInformationLowMem(fileToReads, minValue, posChrSmaller, posChrLarger, resolution)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + } + } else { + if (readNormalizedInterContactInformationLowMem(fileToReads, baseName, normMethod, chrSnp, chrProbe, posChrSmaller, posChrLarger, resolution, minValue)) { + 
outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + } + } + } + } + outWriter.close(); + } + + private static ArrayList includeProxyInfo(ArrayList qtls, String inProxies) throws IOException { + ArrayList newQtlList = new ArrayList(); + + TextFile readProxies = new TextFile(inProxies, TextFile.R); + + String line = readProxies.readLine(); +// System.out.println(line); + while ((line = readProxies.readLine()) != null) { +// System.out.println(line); + String[] lineParts = SPLIT_TAB.split(line); + String chr = lineParts[4]; + int chrPos = Integer.parseInt(lineParts[5]); + int chrNewPos = Integer.parseInt(lineParts[8]); + for (EQTL e : qtls) { + if (String.valueOf(e.getRsChr()).equals(chr) && e.getRsChrPos() == chrPos) { + EQTL newQtl = new EQTL(); + newQtl.setProbe(e.getProbe()); + newQtl.setProbeChr(e.getProbeChr()); + newQtl.setProbeChrPos(e.getProbeChrPos()); + + newQtl.setRsName(e.getRsName() + "-" + lineParts[1]); + newQtl.setRsChr(e.getRsChr()); + newQtl.setRsChrPos(chrNewPos); + newQtlList.add(newQtl); + } + } + } + + for (EQTL e : qtls) { + newQtlList.add(e); + } + + return newQtlList; + } + + //For example, here is a line from the 5kb chr1 MAPQGE30 raw observed contact matrix (GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved): +//40000000 40100000 59.0 + private static LinkedHashSet> readRawInterContactInformation(String fileToReads, double minContactValue) throws IOException { + LinkedHashSet> chrContactInfo = new LinkedHashSet>(); + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToReads), "UTF-8")); + + String row; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); + + int posChr1 = Integer.parseInt(parts[0]); + int posChr2 = Integer.parseInt(parts[1]); + double contact = Double.parseDouble(parts[2]); + if (contact >= 
minContactValue) { + chrContactInfo.add(new Pair(posChr1, posChr2)); + } + } + input.close(); + return chrContactInfo; + + } + + //For example, here is a line from the 5kb chr1 MAPQGE30 raw observed contact matrix (GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved): + //40000000 40100000 59.0 + //To normalize this entry using the KR normalization vector, one would divide 59.0 by the 8001st line ((40000000/5000)+1=8001) and the 8021st line ((40100000/5000)+1=8021) + //of GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.KRnorm. The 8001st line of the KR norm file is 1.2988778370674694; + //The 8021st line of the KR norm file is 1.6080499717941548. So the corresponding KR normalized entry for the entry above is 59.0/(1.2988778370674694*1.6080499717941548) + //or 28.24776973966101. + //If the KR normalization vector file is empty or all NaNs, then the KR algorithm didn�t converge on that particular matrix (likely due to sparsity of the matrix). + private static LinkedHashSet> readNormalizedInterContactInformation(String fileToRead, String baseName, String normMethod, String chrSmaller, String chrLarger, String resolution, double minContactValue) throws IOException { + + //ReadIn normalization chr1 + TextFile inputNormChr1 = new TextFile(baseName + "\\chr" + chrSmaller + "_" + resolution + "." + normMethod, TextFile.R); + ArrayList normFactorSmallerChr = inputNormChr1.readAsArrayList(); + inputNormChr1.close(); + + //ReadIn normalization chr2 + TextFile inputNormChr2 = new TextFile(baseName + "\\chr" + chrLarger + "_" + resolution + "." 
+ normMethod, TextFile.R); + ArrayList normFactorLargerChr = inputNormChr2.readAsArrayList(); + + inputNormChr2.close(); + + LinkedHashSet> chrContactInfo = new LinkedHashSet>(); + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToRead), "UTF-8")); + + String row; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); + + int posChr1 = Integer.parseInt(parts[0]); + int posChr2 = Integer.parseInt(parts[1]); + + String factor1Base = normFactorSmallerChr.get((posChr1 / getNumericResolution(resolution)) + 1); + String factor2Base = normFactorLargerChr.get((posChr2 / getNumericResolution(resolution)) + 1); + + double factor1; + double factor2; + + if (StringUtils.isNumeric(factor1Base) && StringUtils.isNumeric(factor2Base)) { + factor1 = Double.parseDouble(factor1Base); + factor2 = Double.parseDouble(factor2Base); + + double contact = Double.parseDouble(parts[2]) / (factor1 * factor2); + if (contact >= minContactValue) { + chrContactInfo.add(new Pair(posChr1, posChr2)); + } + + } + } + input.close(); + return chrContactInfo; + } + + private static boolean determineContact(int posChrSmaller, int posChrLarger, LinkedHashSet> interestRegions, int resolution) { + //Determine bin1 + //Starts counting at 0-resulution + int bin1 = posChrSmaller - (posChrSmaller % resolution); + + //Determine bin2 + int bin2 = posChrLarger - (posChrLarger % resolution); + + //See if bin1 and bin2 are in the file. 
+ boolean contact = false; + + for (Pair entry : interestRegions) { + if (entry.getLeft() == bin1) { + if (entry.getRight() == bin2) { + contact = true; + break; + } else if (entry.getRight() > bin2) { + break; + } + } else if (entry.getLeft() > bin1) { + break; + } + } + return contact; + } + + private static boolean readRawInterContactInformationLowMem(String fileToReads, double minValue, int posChrSmaller, int posChrLarger, String resolution) throws IOException { + //Determine bin1 + //Starts counting at 0-resulution + int bin1 = posChrSmaller - (posChrSmaller % getNumericResolution(resolution)); + + //Determine bin2 + int bin2 = posChrLarger - (posChrLarger % getNumericResolution(resolution)); + + //See if bin1 and bin2 are in the file. + boolean contactFound = false; + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToReads), "UTF-8")); + + String row; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); + + int posChr1 = Integer.parseInt(parts[0]); + if (posChr1 == bin1) { + int posChr2 = Integer.parseInt(parts[1]); + if (posChr2 == bin2) { + double contact = Double.parseDouble(parts[2]); + if (contact >= minValue) { + contactFound = true; + } + break; + } else if (posChr2 > bin2) { + break; + } + } else if (posChr1 > bin1) { + break; + } + + } + input.close(); + return contactFound; + } + + private static boolean readNormalizedInterContactInformationLowMem(String fileToRead, String baseName, String normMethod, String chrSmaller, String chrLarger, int posChrSmaller, int posChrLarger, String resolution, double minValue) throws IOException { + //Determine bin1 + //Starts counting at 0-resulution + int bin1 = posChrSmaller - (posChrSmaller % getNumericResolution(resolution)); + + //Determine bin2 + int bin2 = posChrLarger - (posChrLarger % getNumericResolution(resolution)); + + //ReadIn normalization chr1 + TextFile inputNormChr1 = new TextFile(baseName + "\\chr" + chrSmaller + 
"_" + resolution + "." + normMethod, TextFile.R); + ArrayList normFactorSmallerChr = inputNormChr1.readAsArrayList(); + inputNormChr1.close(); + + //ReadIn normalization chr2 + TextFile inputNormChr2 = new TextFile(baseName + "\\chr" + chrLarger + "_" + resolution + "." + normMethod, TextFile.R); + ArrayList normFactorLargerChr = inputNormChr2.readAsArrayList(); + + inputNormChr2.close(); + + LinkedHashSet> chrContactInfo = new LinkedHashSet>(); + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToRead), "UTF-8")); + + String row; + + //See if bin1 and bin2 are in the file. + boolean contactFound = false; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); + + int posChr1 = Integer.parseInt(parts[0]); + if (posChr1 == bin1) { + int posChr2 = Integer.parseInt(parts[1]); + if (posChr2 == bin2) { + + String factor1Base = normFactorSmallerChr.get((posChr1 / getNumericResolution(resolution)) + 1); + String factor2Base = normFactorLargerChr.get((posChr2 / getNumericResolution(resolution)) + 1); + + double factor1; + double factor2; + + if (StringUtils.isNumeric(factor1Base) && StringUtils.isNumeric(factor2Base)) { + factor1 = Double.parseDouble(factor1Base); + factor2 = Double.parseDouble(factor2Base); + + double contact = Double.parseDouble(parts[2]) / (factor1 * factor2); + if (contact >= minValue) { + contactFound = true; + } + break; + } + + } else if (posChr2 > bin2) { + break; + } + } else if (posChr1 > bin1) { + break; + } + + } + input.close(); + return contactFound; + } + + private static int getNumericResolution(String resolution) { + if (resolution.equals("1kb")) { + return 1000; + } else if (resolution.equals("5kb")) { + return 5000; + } else { + System.out.println("\nError in resolution setting!\n"); + System.exit(-1); + } + return 0; + } +} \ No newline at end of file diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/HighCTransQTLAnnotator.java 
b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/HighCTransQTLAnnotator.java new file mode 100644 index 000000000..066f68d07 --- /dev/null +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/HighCTransQTLAnnotator.java @@ -0,0 +1,312 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package eqtlmappingpipeline.util; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.regex.Pattern; +import org.apache.commons.lang3.StringUtils; +import umcg.genetica.io.Gpio; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.io.trityper.EQTL; +import umcg.genetica.io.trityper.QTLTextFile; + +/** + * + * @author MarcJan + */ +class HighCTransQTLAnnotator { + + //ToDo Tiedy up the code. To fit the objects made for sorting better. + //Remove hihg-memory part! + private static final Pattern SPLIT_TAB = Pattern.compile("\t"); + + public static void main(String[] args) throws IOException { + + String QTLfile = "D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\eQTLsFDR0.05-ProbeLevel_BsFiltered&Filtered.txt"; + String proxyfile = "D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\proxiesMeQTLSnps.txt"; +// String proxyfile = null; + String QTLoutfile = "D:\\WebFolders\\OwnCloud\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Trans_Pc22c_CisMeQTLc_meQTLs\\RegressedOut_CisEffects_New\\eQTLsFDR0.05-ProbeLevel_BsFiltered&Filtered_HiC_LD_annotated.txt"; + String folderHighC = "F:\\Contacts\\GM12878_combined_interchromosomal\\"; + String resolution = "1kb"; + String qualityCutOff = "E30"; //0 or E30 + String normMethod = null; //null / KRnorm / SQRTVCnorm / VCnorm + double minValueQuality = 0; + + 
addAnnotationToQTLOutput( + QTLfile, + proxyfile, + folderHighC, + resolution, + qualityCutOff, + normMethod, + minValueQuality, + QTLoutfile); + + } + + static void addAnnotationToQTLOutput(String in, String inProxies, String folderHighC, String resolution, String qualityCutOff, String normMethod, double minValue, String out) throws IOException { + QTLTextFile eqtlTextFile = new QTLTextFile(in, QTLTextFile.R); + + ArrayList qtls = eqtlTextFile.readList(); + + if (inProxies != null) { + qtls = includeProxyInfo(qtls, inProxies); + } + + //Here we need to make a new Type to store the potentialy inflated files. + TextFile outWriter = new TextFile(out, TextFile.W); + for (EQTL eqtl : qtls) { + String chrProbe = String.valueOf(eqtl.getProbeChr()); + String chrSnp = String.valueOf(eqtl.getRsChr()); + +// System.out.println(chrProbe+"\t"+chrSnp); + if (chrProbe.equals(chrSnp)) { + //Here we need to check how to normalize and treat intra-chromosomal data. + continue; + } + + int posChrSmaller; + int posChrLarger; + String ChrSmaller; + String ChrLarger; + int bin1; + int bin2; + String baseName; + String fileToReads; + + HashMap contactBuffer = new HashMap(); + + if (Integer.parseInt(chrProbe) < Integer.parseInt(chrSnp)) { + posChrSmaller = eqtl.getProbeChrPos(); + posChrLarger = eqtl.getRsChrPos(); + + ChrSmaller = chrProbe; + ChrLarger = chrSnp; + + //Determine bin1 + //Startscounting at 0-resulution + bin1 = posChrSmaller - (posChrSmaller % getNumericResolution(resolution)); + // System.out.println("\t"+bin1); + //Determine bin2 + bin2 = posChrLarger - (posChrLarger % getNumericResolution(resolution)); + + baseName = folderHighC + resolution + "_resolution_interchromosomal\\chr" + chrProbe + "_chr" + chrSnp + "\\MAPQG" + qualityCutOff; + fileToReads = baseName + "\\chr" + chrProbe + "_" + chrSnp + "_" + resolution + ".RAWobserved"; +// System.out.println("Reading: " + fileToReads); + + } else { + posChrSmaller = eqtl.getRsChrPos(); + posChrLarger = 
eqtl.getProbeChrPos(); + + ChrSmaller = chrSnp; + ChrLarger = chrProbe; + + bin1 = posChrSmaller - (posChrSmaller % getNumericResolution(resolution)); + // System.out.println("\t"+bin1); + //Determine bin2 + bin2 = posChrLarger - (posChrLarger % getNumericResolution(resolution)); + + baseName = folderHighC + resolution + "_resolution_interchromosomal\\chr" + chrSnp + "_chr" + chrProbe + "\\MAPQG" + qualityCutOff; + fileToReads = baseName + "\\chr" + chrSnp + "_" + chrProbe + "_" + resolution + ".RAWobserved"; +// System.out.println("Reading: " + fileToReads); + + } + + if (normMethod == null) { + if (contactBuffer.containsKey(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t" + contactBuffer.get(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2)); + } else { + if (readRawInterContactInformation(fileToReads, minValue, bin1, bin2)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + contactBuffer.put(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2, "Contact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + contactBuffer.put(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2, "-"); + } + } + } else { + if (contactBuffer.containsKey(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t" + contactBuffer.get(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2)); + } else { + if (readNormalizedInterContactInformation(fileToReads, baseName, normMethod, ChrSmaller, ChrLarger, posChrSmaller, posChrLarger, resolution, minValue)) { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\tContact"); + contactBuffer.put(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + bin2, "Contact"); + } else { + outWriter.writeln(eqtl.getRsName() + "\t" + eqtl.getProbe() + "\t-"); + contactBuffer.put(ChrSmaller + "_" + ChrLarger + "_" + bin1 + "_" + 
bin2, "-"); + } + } + } + + } + outWriter.close(); + } + + private static ArrayList includeProxyInfo(ArrayList qtls, String inProxies) throws IOException { + ArrayList newQtlList = new ArrayList(); + + TextFile readProxies = new TextFile(inProxies, TextFile.R); + + String line = readProxies.readLine(); +// System.out.println(line); + while ((line = readProxies.readLine()) != null) { +// System.out.println(line); + String[] lineParts = SPLIT_TAB.split(line); + String chr = lineParts[4]; + int chrPos = Integer.parseInt(lineParts[5]); + int chrNewPos = Integer.parseInt(lineParts[8]); + for (EQTL e : qtls) { + if (String.valueOf(e.getRsChr()).equals(chr) && e.getRsChrPos() == chrPos) { + EQTL newQtl = new EQTL(); + newQtl.setProbe(e.getProbe()); + newQtl.setProbeChr(e.getProbeChr()); + newQtl.setProbeChrPos(e.getProbeChrPos()); + + newQtl.setRsName(e.getRsName() + "-" + lineParts[1]); + newQtl.setRsChr(e.getRsChr()); + newQtl.setRsChrPos(chrNewPos); + newQtlList.add(newQtl); + } + } + } + + for (EQTL e : qtls) { + newQtlList.add(e); + } + + return newQtlList; + } + + //For example, here is a line from the 5kb chr1 MAPQGE30 raw observed contact matrix (GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved): +//40000000 40100000 59.0 + private static boolean readRawInterContactInformation(String fileToReads, double minValue, int bin1, int bin2) throws IOException { +// System.out.println("\t\t"+fileToReads); +// System.out.println("\t"+bin2); + //See if bin1 and bin2 are in the file. + boolean contactFound = false; + + //Check if sorted version is available + //If not make sorted available. 
+ if (!Gpio.exists(fileToReads + ".sorted")) { + umcg.genetica.io.chrContacts.SortInterChrContacts.readNonSortedWriteSorted(fileToReads, fileToReads + ".sorted"); + } + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToReads + ".sorted"), "UTF-8")); + + String row; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); +// System.out.println(row); + int posChr1 = Integer.parseInt(parts[0]); + if (posChr1 == bin1) { + int posChr2 = Integer.parseInt(parts[1]); + if (posChr2 == bin2) { + double contact = Double.parseDouble(parts[2]); + if (contact >= minValue) { + contactFound = true; + } + break; + } else if (posChr2 > bin2) { + break; + } + } else if (posChr1 > bin1) { + break; + } + } + input.close(); + return contactFound; + } + + //For example, here is a line from the 5kb chr1 MAPQGE30 raw observed contact matrix (GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved): + //40000000 40100000 59.0 + //To normalize this entry using the KR normalization vector, one would divide 59.0 by the 8001st line ((40000000/5000)+1=8001) and the 8021st line ((40100000/5000)+1=8021) + //of GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.KRnorm. The 8001st line of the KR norm file is 1.2988778370674694; + //The 8021st line of the KR norm file is 1.6080499717941548. So the corresponding KR normalized entry for the entry above is 59.0/(1.2988778370674694*1.6080499717941548) + //or 28.24776973966101. + //If the KR normalization vector file is empty or all NaNs, then the KR algorithm didn’t converge on that particular matrix (likely due to sparsity of the matrix). 
+ private static boolean readNormalizedInterContactInformation(String fileToRead, String baseName, String normMethod, String chrSmaller, String chrLarger, int posChrSmaller, int posChrLarger, String resolution, double minValue) throws IOException { + //Determine bin1 + //Starts counting at 0-resulution + int bin1 = posChrSmaller - (posChrSmaller % getNumericResolution(resolution)); + + //Determine bin2 + int bin2 = posChrLarger - (posChrLarger % getNumericResolution(resolution)); + + //ReadIn normalization chr1 + TextFile inputNormChr1 = new TextFile(baseName + "\\chr" + chrSmaller + "_" + resolution + "." + normMethod, TextFile.R); + ArrayList normFactorSmallerChr = inputNormChr1.readAsArrayList(); + inputNormChr1.close(); + + //ReadIn normalization chr2 + TextFile inputNormChr2 = new TextFile(baseName + "\\chr" + chrLarger + "_" + resolution + "." + normMethod, TextFile.R); + ArrayList normFactorLargerChr = inputNormChr2.readAsArrayList(); + + inputNormChr2.close(); + + if (!Gpio.exists(fileToRead + ".sorted")) { + umcg.genetica.io.chrContacts.SortInterChrContacts.readNonSortedWriteSorted(fileToRead, fileToRead + ".sorted"); + } + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToRead + ".sorted"), "UTF-8")); + + String row; + + //See if bin1 and bin2 are in the file. 
+ boolean contactFound = false; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); + + int posChr1 = Integer.parseInt(parts[0]); + if (posChr1 == bin1) { + int posChr2 = Integer.parseInt(parts[1]); + if (posChr2 == bin2) { + + String factor1Base = normFactorSmallerChr.get((posChr1 / getNumericResolution(resolution)) + 1); + String factor2Base = normFactorLargerChr.get((posChr2 / getNumericResolution(resolution)) + 1); + + double factor1; + double factor2; + + if (StringUtils.isNumeric(factor1Base) && StringUtils.isNumeric(factor2Base)) { + factor1 = Double.parseDouble(factor1Base); + factor2 = Double.parseDouble(factor2Base); + + double contact = Double.parseDouble(parts[2]) / (factor1 * factor2); + if (contact >= minValue) { + contactFound = true; + } + break; + } + + } else if (posChr2 > bin2) { + break; + } + } else if (posChr1 > bin1) { + break; + } + } + input.close(); + return contactFound; + } + + private static int getNumericResolution(String resolution) { + if (resolution.equals("1kb")) { + return 1000; + } else if (resolution.equals("5kb")) { + return 5000; + } else { + System.out.println("\nError in resolution setting!\n"); + System.exit(-1); + } + return 0; + } +} diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLAnnotator.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLAnnotator.java index fafe55432..ade926b89 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLAnnotator.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLAnnotator.java @@ -43,29 +43,47 @@ public static void main(String[] args) throws IOException { // "1;1;1;10;1;11;1;4;1;4;1;4", "snp;probe;probe;probe;probe;probe", "D:\\UMCG\\ProbeMapping\\Info\\V70\\gencode.v15.annotation.gtf.gz", // "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Cis_Pc22c_meQTLs\\Primary\\eQTLProbesFDR0.05-ProbeLevel_ldDrivenEffectsRemoved-ExtendedInfo.txt"); - 
addAnnotationToQTLOutput( +// addAnnotationToQTLOutput( +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt", +// "D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt", +// "1;8-9-10-11-12-13-14;1;4-5;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4", "snp;probe;snp;snp", null, +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt-ExtendedInfo5.txt"); + + addAnnotationToQTLOutput( "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt", - "D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt", - 
"1;8-9-10-11-12-13-14;1;4-5;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4", "snp;probe;snp;snp", null, - "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt-ExtendedInfo5.txt"); + "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v10.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSuroundingProbes_full.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\statisticsTMM_exprssion.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSurounding_RP3Genes.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt", + 
"0;8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37-38-39-40-41-42-43-44-45-46-47-48-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-69-70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86-87-88-89-90-91-92-93-94-95-96-97-98-99-100-101-102-103-104-105-106-107-108-109-110-111-112-113-114-115-116-117-118-119-120-121-122-123-124-125-126-127-128-129-130-131-132-133-134-135-136-137-138-139-140-141-142-143-144-145-146-147-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-192-193-194-195-196-197-198-199-200-201-202-203-204-205-206-207-208-209-210-211-212-213-214-215-216-217-218-219-220-221-222-223-224-225-226-227-228-229-230-231-232-233-234-235-236-237-238-239-240-241-242-243-244-245-246-247-248-249-250-251-252-253-254-255-256-257-258-259-260-261-262-263-264-265-266-267-268-269-270-271-272-273-274-275-276-277-278-279-280-281-282-283-284-285-286-287-288-289-290-291-292-293-294-295-296-297-298-299-300-301-302-303-304-305-306-307-308-309-310-311-312-313-314-315-316-317-318-319-320-321-322-323-324-325-326-327-328-329-330-331-332-333-334-335-336-337-338-339-340-341-342-343-344-345-346-347-348-349-350-351-352-353-354-355-356-357;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15;0;1-2-3-4;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15;1;4-5;0;1-2-3-4", "snp;snp;probe;probe;probe;snp", null, + "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel_ExtendedInfTMP.txt"); // - addAnnotationToQTLOutput( - "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\QTLCorrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt", - 
"D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt", - "1;8-9-10-11-12-13-14;1;4-5;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4", "snp;probe;snp;snp", null, - "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\QTLCorrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel_ExtendedInfo4.txt"); +// addAnnotationToQTLOutput( +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt", +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v10_13BM.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSuroundingProbes_full.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\statisticsTMM_exprssion.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSurounding_RP3Genes.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt", +// 
"0;8-9-10-11-12-13-14-15-16-17-18-175-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-192-193-194-195-196-197-198-199-200-201-202-203-204-205-206-207-208-291-292-293-294-295-296-297-298-299-300-301-302-303-304-305-306-307-308-309-310-311-312-313-314-315-316-317-318-319-320-321-322-323-324-325-326-327-328-329-330-331-332-333-334-335-336-337-338-339-340-341-342-343-344-345-346;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15;0;1-2-3-4;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15;1;4-5;0;1-2-3-4", "snp;snp;probe;probe;probe;snp", null, +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\Optimal_PC_and_QTL_Corrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel_ExtendedInfo13BM.txt"); + +// addAnnotationToQTLOutput( +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Artificial_eQTMs0.0_Stringent.txt", +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v10.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSuroundingProbes_full.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\statisticsTMM_exprssion.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSurounding_RP3Genes.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt", +// 
"0;8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37-38-39-40-41-42-43-44-45-46-47-48-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-69-70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86-87-88-89-90-91-92-93-94-95-96-97-98-99-100-101-102-103-104-105-106-107-108-109-110-111-112-113-114-115-116-117-118-119-120-121-122-123-124-125-126-127-128-129-130-131-132-133-134-135-136-137-138-139-140-141-142-143-144-145-146-147-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-192-193-194-195-196-197-198-199-200-201-202-203-204-205-206-207-208-209-210-211-212-213-214-215-216-217-218-219-220-221-222-223-224-225-226-227-228-229-230-231-232-233-234-235-236-237-238-239-240-241-242-243-244-245-246-247-248-249-250-251-252-253-254-255-256-257-258-259-260-261-262-263-264-265-266-267-268-269-270-271-272-273-274-275-276-277-278-279-280-281-282-283-284-285-286-287-288-289-290-291-292-293-294-295-296-297-298-299-300-301-302-303-304-305-306-307-308-309-310-311-312-313-314-315-316-317-318-319-320-321-322-323-324-325-326-327-328-329-330-331-332-333-334-335-336-337-338-339-340-341-342-343-344-345-346-347-348-349-350-351-352-353-354-355-356-357;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15;0;1-2-3-4;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15;1;4-5;0;1-2-3-4", "snp;snp;probe;probe;probe;snp", null, +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Artificial_eQTMs0.0_Stringent_ExtendedInfo.txt"); + // addAnnotationToQTLOutput( -// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Cis_Pc22c_meQTLs\\Comparison_eQTLs_meQTLs.txt", -// 
"D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation_InterestCis_MJ_v1.txt.gz", -// "1;8-9-10-11-12-13-14;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4;0;3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37", "probe;probe;probe;snp", null, -// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\meQTLs\\Cis_Pc22c_meQTLs\\Comparison_eQTLs_meQTLs3.txt"); -// +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\QTLCorrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel.txt", +// "D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\UMCG\\Data\\RP3_RNA_Seq\\annotation_geneIds+overlapping_v71_cut.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSuroundingProbes_full.txt;", +// "1;8-9-10-11-12-13-14;1;4-5;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15", 
"snp;probe;snp;snp;snp", null, +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs\\QTLCorrected\\RP3_0.25MB_TSS_extendedCis_eQTMs_2015\\eQTLSNPsFDR0.05-SNPLevel_ExtendedInfo4.txt"); + // addAnnotationToQTLOutput( -// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Comparison_QTLs_eQTMs\\Comparison_forAnnot.txt", -// "D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation_InterestCis_MJ_v1.txt.gz", -// "1;8-9-10-11-12-13-14;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4;0;3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37", "probe;probe;probe;snp", null, -// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Comparison_QTLs_eQTMs\\Comparison_forAnnot_out.txt"); +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs_Exon\\Optimal_PC_and_QTL_Corrected\\eQTLSNPsFDR0.05-SNPLevel.txt", +// "D:\\UMCG\\Methylation_GPL13534\\annotationFile\\Illumina450K_MQtlMappingFile_Extensive.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\Annotation450k_AdditionMJ_v5.txt.gz;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\CODAM_NTR_LLS_LLD_RS_BBMRI_450K_var_mean_median.txt;D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\Annotations\\GC_ContentSuroundingProbes_full.txt;", +// 
"1;8-9-10-11-12-13-14;0;17-18-20-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-70-71-81-133-134-141-142-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-192-193-194-195-196-197-198;0;1-2-3-4;0;1-2-3-4-5-6-7-8-9-10-11-12-13-14-15", "snp;snp;snp;snp", null, +// "D:\\OnlineFolders\\AeroFS\\RP3_BIOS_Methylation\\eQTMs_Exon\\Optimal_PC_and_QTL_Corrected\\eQTLSNPsFDR0.05-SNPLevel_ExtendedInfo2.txt"); +// + + + + } static void addAnnotationToQTLOutput(String in, String sources, String keyValuePairs, String idsToAnnotate, String reannotateGene, String out) throws IOException { diff --git a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLDotPlotter.java b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLDotPlotter.java index 92183e4ac..f7865fb65 100644 --- a/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLDotPlotter.java +++ b/eqtl-mapping-pipeline/src/main/java/eqtlmappingpipeline/util/QTLDotPlotter.java @@ -4,7 +4,7 @@ */ package eqtlmappingpipeline.util; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import eqtlmappingpipeline.metaqtl3.graphics.EQTLDotPlot; import java.io.IOException; import java.util.logging.Level; diff --git a/eqtl-mapping-pipeline/src/main/scripts/settings.xml b/eqtl-mapping-pipeline/src/main/scripts/settings.xml index 6f1053fed..420e86859 100644 --- a/eqtl-mapping-pipeline/src/main/scripts/settings.xml +++ b/eqtl-mapping-pipeline/src/main/scripts/settings.xml @@ -1,7 +1,7 @@ - - + + 0.95 0.0001 0.05 @@ -20,8 +20,8 @@ fdr 0.05 - probe-level - false + probe-level + false 100 @@ -36,15 +36,15 @@ - + false false true - + - + Dataset1 diff --git a/genetica-libraries/pom.xml b/genetica-libraries/pom.xml index 06700db71..0525d9a67 100644 --- a/genetica-libraries/pom.xml +++ b/genetica-libraries/pom.xml @@ -1,121 +1,121 @@ - - 
nl.systemsgenetics - systemsgenetics - 1.0.2-SNAPSHOT - - genetica-libraries - 1.0.6-SNAPSHOT - jar - 4.0.0 - - - - org.apache.maven.plugins - maven-compiler-plugin - 2.3.2 - - 1.7 - 1.7 - - - - - - - commons-primitives - commons-primitives - 1.0 - - - nl.systemsgenetics - Genotype-IO - 1.0.1 - - - commons-collections - commons-collections - 3.2.1 - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + nl.systemsgenetics + systemsgenetics + 1.0.2-SNAPSHOT + + genetica-libraries + 1.0.7-SNAPSHOT + jar + 4.0.0 + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.7 + 1.7 + + + + + + + commons-primitives + commons-primitives + 1.0 + + + nl.systemsgenetics + Genotype-IO + 1.0.1 + + + commons-collections + commons-collections + 3.2.1 + log4j log4j 1.2.17 - - commons-configuration - commons-configuration - 1.6 - - - commons-lang - commons-lang - 2.5 - - - commons-logging - commons-logging - 1.1.1 - - - org.apache.commons - commons-math3 - 3.2 - - - net.sourceforge.parallelcolt - parallelcolt - 0.10.0 - - - net.sf.jsci - jsci - 1.2 - - - com.google.code.gson - gson - 2.1 - - - com.lowagie - itext - 4.2.1 - - - gov.nist.math - jama - 1.0.2 - - - ca.umontreal.iro - ssj - 2.5 - - - colt - colt - - - - - commons-jxpath - commons-jxpath - 1.3 - - - org.testng - testng - 6.5.2 - test - - - net.sf.trove4j - trove4j - 3.0.3 - jar - - + + commons-configuration + commons-configuration + 1.6 + + + commons-lang + commons-lang + 2.5 + + + commons-logging + commons-logging + 1.1.1 + + + org.apache.commons + commons-math3 + 3.2 + + + net.sourceforge.parallelcolt + parallelcolt + 0.10.0 + + + net.sf.jsci + jsci + 1.2 + + + com.google.code.gson + gson + 2.1 + + + com.itextpdf + itextpdf + 5.5.6 + + + gov.nist.math + jama + 1.0.2 + + + ca.umontreal.iro + ssj + 2.5 + + + colt + colt + + + + + commons-jxpath + commons-jxpath + 1.3 + + + org.testng + testng + 6.5.2 + test + + + net.sf.trove4j + trove4j + 3.0.3 + jar + + \ No 
newline at end of file diff --git a/genetica-libraries/src/main/java/umcg/genetica/graphics/ForestPlot.java b/genetica-libraries/src/main/java/umcg/genetica/graphics/ForestPlot.java index 947a674d1..e5e0784b0 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/graphics/ForestPlot.java +++ b/genetica-libraries/src/main/java/umcg/genetica/graphics/ForestPlot.java @@ -4,7 +4,7 @@ */ package umcg.genetica.graphics; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import java.awt.BasicStroke; import java.awt.Color; import java.awt.Font; @@ -85,8 +85,8 @@ public void drawMultiForrestPlot(String xAxisName, String[] yAxisNames, Double[] Locale.setDefault(Locale.US); // set up Graphics2D depending on required format using iText in case PDF Graphics2D g2d = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfWriter writer = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfWriter writer = null; BufferedImage bi = null; int width = 1; int height = 1; @@ -130,16 +130,16 @@ public void drawMultiForrestPlot(String xAxisName, String[] yAxisNames, Double[] height = (yAxisNames.length * textpadding) + (2 * textpadding) + (fontheight * yAxisNames.length) + (topMargin * 2) + geneNameMargin + fontheight + topMargin; System.out.println(height); // initialize plot - com.lowagie.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; if (output == ForestPlot.Output.PDF) { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); + document = new com.itextpdf.text.Document(rectangle); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new 
java.io.FileOutputStream(filename)); document.open(); cb = writer.getDirectContent(); cb.saveState(); - //com.lowagie.text.pdf.DefaultFontMapper fontMap = new com.lowagie.text.pdf.DefaultFontMapper(); + //com.itextpdf.text.pdf.DefaultFontMapper fontMap = new com.itextpdf.text.pdf.DefaultFontMapper(); g2d = cb.createGraphics(width, height); } else { bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); diff --git a/genetica-libraries/src/main/java/umcg/genetica/graphics/Heatmap.java b/genetica-libraries/src/main/java/umcg/genetica/graphics/Heatmap.java index 9fcf3b8f6..9549ebdb7 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/graphics/Heatmap.java +++ b/genetica-libraries/src/main/java/umcg/genetica/graphics/Heatmap.java @@ -5,8 +5,8 @@ package umcg.genetica.graphics; import JSci.maths.ArrayMath; -import com.lowagie.text.DocumentException; -import com.lowagie.text.Rectangle; +import com.itextpdf.text.DocumentException; +import com.itextpdf.text.Rectangle; import java.awt.Color; import java.awt.Font; import java.awt.FontMetrics; @@ -62,18 +62,18 @@ public static void drawHeatmap(double[][] values, String[] rowHeaders, String[] } // set up Graphics2D depending on required format using iText in case PDF Graphics2D g2d = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfWriter writer = null; - com.lowagie.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfWriter writer = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; BufferedImage bi = null; if (output == Output.PDF) { Rectangle rectangle = new Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); + document = new com.itextpdf.text.Document(rectangle); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); document.open(); 
cb = writer.getDirectContent(); - //com.lowagie.text.pdf.DefaultFontMapper fontMap = new com.lowagie.text.pdf.DefaultFontMapper(); + //com.itextpdf.text.pdf.DefaultFontMapper fontMap = new com.itextpdf.text.pdf.DefaultFontMapper(); cb.saveState(); g2d = cb.createGraphics(width, height); } else { @@ -203,19 +203,19 @@ public static void drawCorrelationHeatmap(double[][] values, String[] rowHeaders // set up Graphics2D depending on required format using iText in case PDF Graphics2D g2d = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfWriter writer = null; - com.lowagie.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfWriter writer = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; BufferedImage bi = null; if (output == Output.PDF) { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(width, height); - document = new com.lowagie.text.Document(rectangle); - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(width, height); + document = new com.itextpdf.text.Document(rectangle); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(filename)); document.open(); cb = writer.getDirectContent(); cb.saveState(); - //com.lowagie.text.pdf.DefaultFontMapper fontMap = new com.lowagie.text.pdf.DefaultFontMapper(); + //com.itextpdf.text.pdf.DefaultFontMapper fontMap = new com.itextpdf.text.pdf.DefaultFontMapper(); g2d = cb.createGraphics(width, height); } else { bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); @@ -358,7 +358,7 @@ private static void normalizeCorrelations(double[][] values) { } private Rectangle getScaleGradient(int width, int height) { - com.lowagie.text.Rectangle r = new com.lowagie.text.Rectangle(width, height); + com.itextpdf.text.Rectangle r = new 
com.itextpdf.text.Rectangle(width, height); return r; } diff --git a/genetica-libraries/src/main/java/umcg/genetica/graphics/ScatterPlot.java b/genetica-libraries/src/main/java/umcg/genetica/graphics/ScatterPlot.java index 2499bbe31..ed5a87597 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/graphics/ScatterPlot.java +++ b/genetica-libraries/src/main/java/umcg/genetica/graphics/ScatterPlot.java @@ -4,9 +4,9 @@ */ package umcg.genetica.graphics; -import com.lowagie.text.DocumentException; -import com.lowagie.text.Rectangle; -import com.lowagie.text.pdf.PdfContentByte; +import com.itextpdf.text.DocumentException; +import com.itextpdf.text.Rectangle; +import com.itextpdf.text.pdf.PdfContentByte; import java.awt.Color; import java.awt.Font; import java.awt.FontMetrics; @@ -48,8 +48,8 @@ public class ScatterPlot { private int fontheight; private OUTPUTFORMAT format; private String outfilename; - private com.lowagie.text.Document document = null; - private com.lowagie.text.pdf.PdfWriter writer = null; + private com.itextpdf.text.Document document = null; + private com.itextpdf.text.pdf.PdfWriter writer = null; private PdfContentByte cb; private int[] category; private Color[] colors; @@ -141,13 +141,13 @@ private void init() { if (format == OUTPUTFORMAT.PDF) { Rectangle rectangle = new Rectangle(graphWidth, graphHeight); - document = new com.lowagie.text.Document(rectangle); + document = new com.itextpdf.text.Document(rectangle); if (!outfilename.toLowerCase().endsWith(".pdf")) { outfilename += ".pdf"; } try { - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outfilename)); + writer = com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outfilename)); } catch (DocumentException e) { e.printStackTrace(); @@ -158,7 +158,7 @@ private void init() { cb = writer.getDirectContent(); cb.saveState(); -// com.lowagie.text.pdf.DefaultFontMapper fontMap = new com.lowagie.text.pdf.DefaultFontMapper(); 
+// com.itextpdf.text.pdf.DefaultFontMapper fontMap = new com.itextpdf.text.pdf.DefaultFontMapper(); g2d = cb.createGraphics(graphWidth, graphHeight); } else { bi = new java.awt.image.BufferedImage(graphWidth, graphHeight, java.awt.image.BufferedImage.TYPE_INT_RGB); diff --git a/genetica-libraries/src/main/java/umcg/genetica/graphics/ViolinBoxPlot.java b/genetica-libraries/src/main/java/umcg/genetica/graphics/ViolinBoxPlot.java index a16b32e41..0db332f0e 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/graphics/ViolinBoxPlot.java +++ b/genetica-libraries/src/main/java/umcg/genetica/graphics/ViolinBoxPlot.java @@ -4,7 +4,7 @@ */ package umcg.genetica.graphics; -import com.lowagie.text.DocumentException; +import com.itextpdf.text.DocumentException; import java.awt.BasicStroke; import java.awt.Color; import java.awt.Graphics2D; @@ -54,9 +54,9 @@ public void draw(double[][][] vals, String[] datasetNames, String[][] xLabels, S Locale.setDefault(Locale.US); // set up Graphics2D depending on required format using iText in case PDF Graphics2D g2d = null; - com.lowagie.text.Document document = null; - com.lowagie.text.pdf.PdfWriter writer = null; - com.lowagie.text.pdf.PdfContentByte cb = null; + com.itextpdf.text.Document document = null; + com.itextpdf.text.pdf.PdfWriter writer = null; + com.itextpdf.text.pdf.PdfContentByte cb = null; BufferedImage bi = null; @@ -109,17 +109,17 @@ public void draw(double[][][] vals, String[] datasetNames, String[][] xLabels, S } if (output == Output.PDF) { - com.lowagie.text.Rectangle rectangle = new com.lowagie.text.Rectangle(docWidth, docHeight); - document = new com.lowagie.text.Document(rectangle); + com.itextpdf.text.Rectangle rectangle = new com.itextpdf.text.Rectangle(docWidth, docHeight); + document = new com.itextpdf.text.Document(rectangle); try { - writer = com.lowagie.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outputFileName)); + writer = 
com.itextpdf.text.pdf.PdfWriter.getInstance(document, new java.io.FileOutputStream(outputFileName)); } catch (DocumentException e) { throw new IOException(e.fillInStackTrace()); } document.open(); cb = writer.getDirectContent(); cb.saveState(); - //com.lowagie.text.pdf.DefaultFontMapper fontMap = new com.lowagie.text.pdf.DefaultFontMapper(); + //com.itextpdf.text.pdf.DefaultFontMapper fontMap = new com.itextpdf.text.pdf.DefaultFontMapper(); g2d = cb.createGraphics(docWidth, docHeight); } else { bi = new BufferedImage(docWidth, docHeight, BufferedImage.TYPE_INT_RGB); diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/bin/BinaryFile.java b/genetica-libraries/src/main/java/umcg/genetica/io/bin/BinaryFile.java index e7685de19..bae48877d 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/io/bin/BinaryFile.java +++ b/genetica-libraries/src/main/java/umcg/genetica/io/bin/BinaryFile.java @@ -7,8 +7,6 @@ import com.mastfrog.util.streams.HashingOutputStream; import java.io.*; import java.security.NoSuchAlgorithmException; -import java.util.logging.Level; -import java.util.logging.Logger; /** * @@ -16,193 +14,193 @@ */ public class BinaryFile { - public static final boolean W = true; - public static final boolean R = false; - protected final DataOutputStream os; - protected final DataInputStream is; - protected final String loc; - protected final boolean writeable; - private final HashingOutputStream osh; - - public BinaryFile(String loc, boolean mode) throws IOException { - if (loc.trim().length() == 0) { - throw new IOException("Could not find file: no file specified"); - } - this.writeable = mode; - this.loc = loc; - - if (writeable) { - try { - is = null; - osh = new HashingOutputStream("md5", new FileOutputStream(loc)); - os = new DataOutputStream(new BufferedOutputStream(osh)); - } catch (NoSuchAlgorithmException ex) { - throw new RuntimeException(ex); - } - } else { - is = new DataInputStream(new BufferedInputStream(new FileInputStream(loc))); - os 
= null; - osh = null; - } - } - - public BinaryFile(String loc, boolean mode, int buffersize) throws IOException { - if (loc.trim().length() == 0) { - throw new IOException("Could not find file: no file specified"); - } - this.writeable = mode; - this.loc = loc; - - if (writeable) { - try { - is = null; - osh = new HashingOutputStream("md5", new FileOutputStream(loc)); - os = new DataOutputStream(new BufferedOutputStream(osh, buffersize)); - } catch (NoSuchAlgorithmException ex) { - throw new RuntimeException(ex); - } - } else { - is = new DataInputStream(new BufferedInputStream(new FileInputStream(loc), buffersize)); - os = null; - osh = null; - } - } - - public void writeBytes(byte[] v) throws IOException { - if (writeable) { - os.write(v); - } else { - throw new IOException("File is read only."); - } - } - - public void writeInt(int v) throws IOException { - if (writeable) { - os.writeInt(v); - } else { - throw new IOException("File is read only."); - } - } - - public void writeString(String s) throws IOException { - if (writeable) { - os.writeChars(s); - } else { - throw new IOException("File is read only."); - } - } - - public void writeBool(boolean b) throws IOException { - if (writeable) { - os.writeBoolean(b); - } else { - throw new IOException("File is read only."); - } - } - - public void writeFloat(float f) throws IOException { - if (writeable) { - os.writeFloat(f); - } else { - throw new IOException("File is read only."); - } - } - - public void writeDouble(double d) throws IOException { - if (writeable) { - os.writeDouble(d); - } else { - throw new IOException("File is read only."); - } - } - - public void writeLong(long l) throws IOException { - if (writeable) { - os.writeLong(l); - } else { - throw new IOException("File is read only."); - } - } - - // read functions - public int readInt() throws IOException, EOFException { - if (writeable) { - throw new IOException("File is write only."); - } else { - return is.readInt(); - } - } - - public boolean 
readBool() throws IOException, EOFException { - if (writeable) { - throw new IOException("File is write only."); - } else { - return is.readBoolean(); - } - } - - public String readString() throws IOException, EOFException { - if (writeable) { - throw new IOException("File is write only."); - } else { - return is.readUTF(); - } - } - - public float readFloat() throws IOException, EOFException { - if (writeable) { - throw new IOException("File is write only."); - } else { - return is.readFloat(); - - } - } - - public double readDouble() throws IOException, EOFException { - if (writeable) { - throw new IOException("File is write only."); - } else { - return is.readDouble(); - } - } - - public long readLong() throws IOException, EOFException { - if (writeable) { - throw new IOException("File is write only."); - } else { - return is.readLong(); - } - } - - public void close() throws IOException { - if (writeable) { - os.close(); - } else { - is.close(); - } - } - - public void writeByte(byte b) throws IOException { - if (writeable) { - os.writeByte(b); - } else { - throw new IOException("File is read only."); - } - } - - public int read() throws IOException { - return is.read(); - } - - public void write(int b) throws IOException { - os.write(b); - } - - public byte[] getWrittenHash() throws IOException { - if (writeable) { - return osh.getDigest(); - } else { - return null; - } - } + public static final boolean W = true; + public static final boolean R = false; + protected final DataOutputStream os; + protected final DataInputStream is; + protected final String loc; + protected final boolean writeable; + private final HashingOutputStream osh; + + public BinaryFile(String loc, boolean mode) throws IOException { + if (loc.trim().length() == 0) { + throw new IOException("Could not find file: no file specified"); + } + this.writeable = mode; + this.loc = loc; + + if (writeable) { + try { + is = null; + osh = new HashingOutputStream("md5", new FileOutputStream(loc)); + os 
= new DataOutputStream(new BufferedOutputStream(osh, 32 * 1024)); + } catch (NoSuchAlgorithmException ex) { + throw new RuntimeException(ex); + } + } else { + is = new DataInputStream(new BufferedInputStream(new FileInputStream(loc))); + os = null; + osh = null; + } + } + + public BinaryFile(String loc, boolean mode, int buffersize) throws IOException { + if (loc.trim().length() == 0) { + throw new IOException("Could not find file: no file specified"); + } + this.writeable = mode; + this.loc = loc; + + if (writeable) { + try { + is = null; + osh = new HashingOutputStream("md5", new FileOutputStream(loc)); + os = new DataOutputStream(new BufferedOutputStream(osh, buffersize)); + } catch (NoSuchAlgorithmException ex) { + throw new RuntimeException(ex); + } + } else { + is = new DataInputStream(new BufferedInputStream(new FileInputStream(loc), buffersize)); + os = null; + osh = null; + } + } + + public void writeBytes(byte[] v) throws IOException { + if (writeable) { + os.write(v); + } else { + throw new IOException("File is read only."); + } + } + + public void writeInt(int v) throws IOException { + if (writeable) { + os.writeInt(v); + } else { + throw new IOException("File is read only."); + } + } + + public void writeString(String s) throws IOException { + if (writeable) { + os.writeChars(s); + } else { + throw new IOException("File is read only."); + } + } + + public void writeBool(boolean b) throws IOException { + if (writeable) { + os.writeBoolean(b); + } else { + throw new IOException("File is read only."); + } + } + + public void writeFloat(float f) throws IOException { + if (writeable) { + os.writeFloat(f); + } else { + throw new IOException("File is read only."); + } + } + + public void writeDouble(double d) throws IOException { + if (writeable) { + os.writeDouble(d); + } else { + throw new IOException("File is read only."); + } + } + + public void writeLong(long l) throws IOException { + if (writeable) { + os.writeLong(l); + } else { + throw new 
IOException("File is read only."); + } + } + + // read functions + public int readInt() throws IOException, EOFException { + if (writeable) { + throw new IOException("File is write only."); + } else { + return is.readInt(); + } + } + + public boolean readBool() throws IOException, EOFException { + if (writeable) { + throw new IOException("File is write only."); + } else { + return is.readBoolean(); + } + } + + public String readString() throws IOException, EOFException { + if (writeable) { + throw new IOException("File is write only."); + } else { + return is.readUTF(); + } + } + + public float readFloat() throws IOException, EOFException { + if (writeable) { + throw new IOException("File is write only."); + } else { + return is.readFloat(); + + } + } + + public double readDouble() throws IOException, EOFException { + if (writeable) { + throw new IOException("File is write only."); + } else { + return is.readDouble(); + } + } + + public long readLong() throws IOException, EOFException { + if (writeable) { + throw new IOException("File is write only."); + } else { + return is.readLong(); + } + } + + public void close() throws IOException { + if (writeable) { + os.close(); + } else { + is.close(); + } + } + + public void writeByte(byte b) throws IOException { + if (writeable) { + os.writeByte(b); + } else { + throw new IOException("File is read only."); + } + } + + public int read() throws IOException { + return is.read(); + } + + public void write(int b) throws IOException { + os.write(b); + } + + public byte[] getWrittenHash() throws IOException { + if (writeable) { + return osh.getDigest(); + } else { + return null; + } + } } diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/chrContacts/InterChrContact.java b/genetica-libraries/src/main/java/umcg/genetica/io/chrContacts/InterChrContact.java new file mode 100644 index 000000000..d1fe25323 --- /dev/null +++ b/genetica-libraries/src/main/java/umcg/genetica/io/chrContacts/InterChrContact.java @@ -0,0 
+1,52 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package umcg.genetica.io.chrContacts; + +/** + * + * @author MaKKie_Admin + */ +public class InterChrContact implements Comparable { + + final int chrLocationSmaller; + final int chrLocationLarger; + final double contactValue; + + public InterChrContact(int chrLocSmal, int chrLocLarge, double contactVal) { + this.chrLocationLarger = chrLocLarge; + this.chrLocationSmaller = chrLocSmal; + this.contactValue = contactVal; + } + + @Override + public int compareTo(InterChrContact other) { + if (other.getChrLocationSmaller() > this.chrLocationSmaller) { + return -1; + } else if (other.getChrLocationSmaller() < this.chrLocationSmaller) { + return 1; + } else { + if (other.getChrLocationLarger() > this.chrLocationLarger) { + return -1; + } else if (other.getChrLocationLarger() < this.chrLocationLarger) { + return 1; + } else { + return 0; + } + } + } + + public int getChrLocationSmaller() { + return chrLocationSmaller; + } + + public int getChrLocationLarger() { + return chrLocationLarger; + } + + public double getContactValue() { + return contactValue; + } +} diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/chrContacts/SortInterChrContacts.java b/genetica-libraries/src/main/java/umcg/genetica/io/chrContacts/SortInterChrContacts.java new file mode 100644 index 000000000..20875f8e5 --- /dev/null +++ b/genetica-libraries/src/main/java/umcg/genetica/io/chrContacts/SortInterChrContacts.java @@ -0,0 +1,76 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package umcg.genetica.io.chrContacts; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.commons.lang3.StringUtils; +import umcg.genetica.io.text.TextFile; + +/** + * + * @author MaKKie_Admin + */ +public class SortInterChrContacts { + + public static void readNonSortedWriteSorted(String fileToReads, String fileToWrite){ + ArrayList contacts = null; + try { + contacts = readRawInterContactInformation(fileToReads); + } catch (IOException ex) { + Logger.getLogger(SortInterChrContacts.class.getName()).log(Level.SEVERE, null, ex); + } + Collections.sort(contacts); + + try { + writeRawInterContactInformation(contacts, fileToWrite); + } catch (IOException ex) { + Logger.getLogger(SortInterChrContacts.class.getName()).log(Level.SEVERE, null, ex); + } + + } + + private static ArrayList readRawInterContactInformation(String fileToReads) throws IOException { + ArrayList chrContactInfo = new ArrayList(); + + BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(fileToReads), "UTF-8")); + + String row; + + while ((row = input.readLine()) != null) { + String[] parts = StringUtils.split(row, '\t'); + + int posChr1 = Integer.parseInt(parts[0]); + int posChr2 = Integer.parseInt(parts[1]); + double contact = Double.parseDouble(parts[2]); + chrContactInfo.add(new InterChrContact(posChr1, posChr2, contact)); + } + input.close(); + return chrContactInfo; + + } + + private static ArrayList writeRawInterContactInformation(ArrayList contacts, String fileToWrite) throws IOException { + ArrayList chrContactInfo = new ArrayList(); + + TextFile outWriter = new TextFile(fileToWrite, TextFile.W); + + String row; + + for(InterChrContact contact : contacts){ + 
outWriter.writeln(contact.getChrLocationSmaller()+"\t"+contact.getChrLocationLarger()+"\t"+contact.getContactValue()); + } + outWriter.close(); + return chrContactInfo; + + } +} diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/probemapping/reading.java b/genetica-libraries/src/main/java/umcg/genetica/io/probemapping/reading.java index 25050ca48..4d6638f01 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/io/probemapping/reading.java +++ b/genetica-libraries/src/main/java/umcg/genetica/io/probemapping/reading.java @@ -4,6 +4,7 @@ */ package umcg.genetica.io.probemapping; +import gnu.trove.map.hash.THashMap; import java.awt.TextField; import java.io.BufferedReader; import java.io.File; @@ -398,8 +399,8 @@ private static int getNrNs(String string) { * @param sizeMap * @return */ - public static HashMap> readAnnotationFile(String annotationFile, int storingId, int sizeMap) { - HashMap> probeInfo = new HashMap>((int) Math.ceil(sizeMap / 0.75)); + public static THashMap> readAnnotationFile(String annotationFile, int storingId, int sizeMap) { + THashMap> probeInfo = new THashMap>((int) Math.ceil(sizeMap / 0.75)); int entryId = 0; try { TextFile in = new TextFile(annotationFile, TextFile.R); @@ -411,7 +412,7 @@ public static HashMap> readAnnotationFile(String while ((str = in.readLine()) != null) { String[] strParts = SPLIT_ON_TAB.split(str); - HashMap t = new HashMap((int) Math.ceil(header.length / 0.75)); + THashMap t = new THashMap((int) Math.ceil(header.length / 0.75)); for (int i = 0; i < strParts.length; ++i) { if (i != storingId) { t.put(header[i], strParts[i]); diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultDataset.java b/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultDataset.java index 8bd3599a2..19c9c1dca 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultDataset.java +++ 
b/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultDataset.java @@ -5,191 +5,188 @@ package umcg.genetica.io.trityper.bin; +import umcg.genetica.console.ConsoleGUIElems; +import umcg.genetica.console.ProgressBar; + import java.io.IOException; import java.util.HashMap; import java.util.zip.DataFormatException; -import umcg.genetica.console.ConsoleGUIElems; -import umcg.genetica.console.ProgressBar; /** - * * @author harmjan */ public class BinaryResultDataset { - private String m_name; - private String m_location; - private BinaryResultSNP[] snps; - private HashMap stringToSNP = new HashMap(); - private BinaryResultProbe[] probes; - private HashMap stringToProbe = new HashMap(); - private BinaryGZipFloatMatrix bgfm; - private int maxNrSamples; -// private long[] filepointers; - private float maxfloat = Float.MIN_VALUE; - private float minfloat = Float.MIN_VALUE; - private int numprobes; + private String m_name; + private String m_location; + private BinaryResultSNP[] snps; + private HashMap stringToSNP = new HashMap(); + private BinaryResultProbe[] probes; + private HashMap stringToProbe = new HashMap(); + private BinaryGZipFloatMatrix bgfm; + private int maxNrSamples; + // private long[] filepointers; + private float maxfloat = Float.MIN_VALUE; + private float minfloat = Float.MIN_VALUE; + private int numprobes; + + public BinaryResultDataset(String location, String name, int permutation) throws IOException { + m_location = location; + m_name = name; + System.out.println("Loading " + name + " from " + location); + if (permutation == 0) { + load(m_location + m_name + ".ProbeSummary.dat", m_location + m_name + ".SNPSummary.dat", m_location + m_name + ".ZScoreMatrix.dat"); + } else { + load(m_location + m_name + ".ProbeSummary.dat", m_location + m_name + "-PermutationRound-" + permutation + ".SNPSummary.dat", m_location + m_name + "-PermutationRound-" + permutation + ".ZScoreMatrix.dat"); + } + + } - public BinaryResultDataset(String 
location, String name, int permutation) throws IOException { - m_location = location; - m_name = name; - System.out.println("Loading "+name+" from "+location); - if(permutation == 0){ - load(m_location+m_name+".ProbeSummary.dat",m_location+m_name+".SNPSummary.dat",m_location+m_name+".ZScoreMatrix.dat"); - } else { - load(m_location+m_name+".ProbeSummary.dat",m_location+m_name+"-PermutationRound-"+permutation+".SNPSummary.dat",m_location+m_name+"-PermutationRound-"+permutation+".ZScoreMatrix.dat"); - } + private void load(String probesummaryloc, String snpsummaryloc, String zscoreloc) throws IOException { + System.out.println("Loading files: \n - " + probesummaryloc + "\n - " + snpsummaryloc + "\n - " + zscoreloc); + BinaryResultProbeSummary ps = new BinaryResultProbeSummary(probesummaryloc, BinaryResultProbeSummary.R); + BinaryResultSNPSummary ss = new BinaryResultSNPSummary(snpsummaryloc, BinaryResultSNPSummary.R); - } + snps = ss.readAllSNPs(); - private void load(String probesummaryloc, String snpsummaryloc, String zscoreloc) throws IOException { - System.out.println("Loading files: \n - "+probesummaryloc+"\n - "+snpsummaryloc +"\n - "+zscoreloc); - BinaryResultProbeSummary ps = new BinaryResultProbeSummary(probesummaryloc, BinaryResultProbeSummary.R); - BinaryResultSNPSummary ss = new BinaryResultSNPSummary(snpsummaryloc, BinaryResultSNPSummary.R); + probes = ps.readAllProbes(); + for (BinaryResultSNP s : snps) { + stringToSNP.put(s.getName().intern(), s); + } + for (BinaryResultProbe p : probes) { + stringToProbe.put(p.getName().intern(), p); + } + System.out.print("Dataset\t" + m_name + "\n" + ConsoleGUIElems.LINE); + System.out.println(snps.length + "\t\tSNPs read."); + System.out.println(probes.length + "\t\tProbes read."); + System.out.println(ss.getMaxNrSamples() + " samples."); + this.maxNrSamples = ss.getMaxNrSamples(); - snps = ss.readAllSNPs(); - probes = ps.readAllProbes(); + ps.close(); + ss.close(); - for(BinaryResultSNP s: snps){ - 
stringToSNP.put(s.getName(), s); - } - for(BinaryResultProbe p: probes){ - stringToProbe.put(p.getName(), p); - } - System.out.print("Dataset\t"+m_name+"\n"+ConsoleGUIElems.LINE); - System.out.println(snps.length+"\t\tSNPs read."); - System.out.println(probes.length+"\t\tProbes read."); - System.out.println(ss.getMaxNrSamples() +" samples."); - this.maxNrSamples = ss.getMaxNrSamples(); + bgfm = new BinaryGZipFloatMatrix(zscoreloc, BinaryGZipFloatMatrix.R); +// checkMatrix(); + numprobes = probes.length; + System.out.println(ConsoleGUIElems.LINE); + } + public void closeMatrix() throws IOException { + if (bgfm != null) { + bgfm.close(); + bgfm = null; + } + } - ps.close(); - ss.close(); + public void openMatrix(int permutation) throws IOException { + closeMatrix(); + } + /** + * @return the m_name + */ + public String getM_name() { + return m_name; + } - bgfm = new BinaryGZipFloatMatrix(zscoreloc, BinaryGZipFloatMatrix.R); -// checkMatrix(); - numprobes = probes.length; - System.out.println(ConsoleGUIElems.LINE); - } - - public void closeMatrix() throws IOException { - if(bgfm != null){ - bgfm.close(); - bgfm = null; - } - } - - - public void openMatrix(int permutation) throws IOException { - closeMatrix(); - } - - /** - * @return the m_name - */ - public String getM_name() { - return m_name; - } - - /** - * @param m_name the m_name to set - */ - public void setM_name(String m_name) { - this.m_name = m_name; - } - - /** - * @return the m_location - */ - public String getM_location() { - return m_location; - } - - /** - * @param m_location the m_location to set - */ - public void setM_location(String m_location) { - this.m_location = m_location; - } - - /** - * @return the snps - */ - public BinaryResultSNP[] getSnps() { - return snps; - } - - /** - * @param snps the snps to set - */ - public void setSnps(BinaryResultSNP[] snps) { - this.snps = snps; - } - - /** - * @return the stringToSNP - */ - public HashMap getStringToSNP() { - return stringToSNP; - } - - /** - * 
@param stringToSNP the stringToSNP to set - */ - public void setStringToSNP(HashMap stringToSNP) { - this.stringToSNP = stringToSNP; - } - - /** - * @return the probes - */ - public BinaryResultProbe[] getProbes() { - return probes; - } - - /** - * @param probes the probes to set - */ - public void setProbes(BinaryResultProbe[] probes) { - this.probes = probes; - } - - /** - * @return the stringToProbe - */ - public HashMap getStringToProbe() { - return stringToProbe; - } - - /** - * @param stringToProbe the stringToProbe to set - */ - public void setStringToProbe(HashMap stringToProbe) { - this.stringToProbe = stringToProbe; - } - - private void checkMatrix() throws IOException { - System.out.println("Detecting whether binary matrix corresponds to SNP and Probe definition."); - long expectedsize = (long) snps.length*probes.length; - System.out.println("Expected matrix size:\t"+expectedsize+" Z-scores"); - System.out.println("Checking matrix: "); - ProgressBar pb = new ProgressBar(snps.length); - long count = 0; - for(int i=0; i getStringToSNP() { + return stringToSNP; + } + + /** + * @param stringToSNP the stringToSNP to set + */ + public void setStringToSNP(HashMap stringToSNP) { + this.stringToSNP = stringToSNP; + } + + /** + * @return the probes + */ + public BinaryResultProbe[] getProbes() { + return probes; + } + + /** + * @param probes the probes to set + */ + public void setProbes(BinaryResultProbe[] probes) { + this.probes = probes; + } + + /** + * @return the stringToProbe + */ + public HashMap getStringToProbe() { + return stringToProbe; + } + + /** + * @param stringToProbe the stringToProbe to set + */ + public void setStringToProbe(HashMap stringToProbe) { + this.stringToProbe = stringToProbe; + } + + private void checkMatrix() throws IOException { + System.out.println("Detecting whether binary matrix corresponds to SNP and Probe definition."); + long expectedsize = (long) snps.length * probes.length; + System.out.println("Expected matrix size:\t" + 
expectedsize + " Z-scores"); + System.out.println("Checking matrix: "); + ProgressBar pb = new ProgressBar(snps.length); + long count = 0; + for (int i = 0; i < snps.length; i++) { + long index = snps[i].getzScoreIndex(); + long next = -1; + if (i + 1 < snps.length) { + next = snps[i + 1].getzScoreIndex(); + } + + try { + bgfm.read(index, next, probes.length); // for(int f=0; f probes = new ArrayList(); - BinaryResultProbe probe = readNextProbe(); + public BinaryResultProbe[] readAllProbes() throws IOException { + ArrayList probes = new ArrayList(); + BinaryResultProbe probe = readNextProbe(); - int ct = 0; - while (probe != null) { - probes.add(probe); - probe = readNextProbe(); - } + int ct = 0; + while (probe != null) { + probes.add(probe); + probe = readNextProbe(); + } - BinaryResultProbe[] probelist = new BinaryResultProbe[probes.size()]; - for (int p = 0; p < probelist.length; p++) { - probelist[p] = probes.get(p); + BinaryResultProbe[] probelist = new BinaryResultProbe[probes.size()]; + for (int p = 0; p < probelist.length; p++) { + probelist[p] = probes.get(p); + } + return probelist; } - return probelist; - } - - public BinaryResultProbe readNextProbe() throws IOException { - BinaryResultProbe p = null; - try { - p = new BinaryResultProbe(); - p.setId(in.readInt()); - p.setName(in.readUTF()); - p.setChr(in.readByte()); - p.setMidpoint(in.readInt()); - p.setAnnotation(in.readUTF()); - } catch (EOFException e) { - return null; + + public BinaryResultProbe readNextProbe() throws IOException { + BinaryResultProbe p = null; + try { + p = new BinaryResultProbe(); + p.setId(in.readInt()); + p.setName(in.readUTF().intern()); + p.setChr(in.readByte()); + p.setMidpoint(in.readInt()); + p.setAnnotation(in.readUTF().intern()); + } catch (EOFException e) { + return null; + } + return p; } - return p; - } } diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultSNPSummary.java 
b/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultSNPSummary.java index 36c80a663..01bf43e3c 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultSNPSummary.java +++ b/genetica-libraries/src/main/java/umcg/genetica/io/trityper/bin/BinaryResultSNPSummary.java @@ -92,7 +92,7 @@ public BinaryResultSNP readNextSNP() throws IOException { byte[] alleles = new byte[2]; s = new BinaryResultSNP(); s.setId(in.readInt()); - s.setName(in.readUTF()); + s.setName(in.readUTF().intern()); s.setChr(in.readByte()); s.setChrpos(in.readInt()); s.setHwe(in.readDouble()); diff --git a/genetica-libraries/src/main/java/umcg/genetica/io/trityper/probeannotation/ProbeTranslation.java b/genetica-libraries/src/main/java/umcg/genetica/io/trityper/probeannotation/ProbeTranslation.java index abfe6da5f..252f1ee89 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/io/trityper/probeannotation/ProbeTranslation.java +++ b/genetica-libraries/src/main/java/umcg/genetica/io/trityper/probeannotation/ProbeTranslation.java @@ -4,11 +4,12 @@ */ package umcg.genetica.io.trityper.probeannotation; -import java.io.IOException; -import java.util.HashMap; import umcg.genetica.io.text.TextFile; import umcg.genetica.io.trityper.util.ChrAnnotation; +import java.io.IOException; +import java.util.HashMap; + /** * * @author harmjan @@ -64,7 +65,7 @@ public void load(String probeAnnotation) throws IOException { String symbol = elems[4]; num = 0; - probeName[probeNum] = elems[0]; + probeName[probeNum] = elems[0].intern(); byte bchr = -1; try { @@ -107,7 +108,7 @@ public void load(String probeAnnotation) throws IOException { actualMappingPosition.put(probeNum, chrpos); probeChr[probeNum] = bchr; probeChrPos[probeNum] = bchrpos; - probeSymbol[probeNum] = symbol; + probeSymbol[probeNum] = symbol.intern(); for (int i = 5; i < elems.length; i++) { @@ -116,9 +117,9 @@ public void load(String probeAnnotation) throws IOException { try { String[] addresselems 
= arrayaddress.split(","); for (int q = 0; q < addresselems.length; q++) { - String address = addresselems[q]; + String address = addresselems[q].intern(); - oldToNewProbeAddress.put(annotationname[i - 5] + address, probeNum); + oldToNewProbeAddress.put(annotationname[i - 5] + address.intern(), probeNum); } diff --git a/genetica-libraries/src/main/java/umcg/genetica/math/matrix2/DoubleMatrixDataset.java b/genetica-libraries/src/main/java/umcg/genetica/math/matrix2/DoubleMatrixDataset.java index b7664b3ed..2230738c9 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/math/matrix2/DoubleMatrixDataset.java +++ b/genetica-libraries/src/main/java/umcg/genetica/math/matrix2/DoubleMatrixDataset.java @@ -4,6 +4,7 @@ */ package umcg.genetica.math.matrix2; +import cern.colt.matrix.tdouble.DoubleMatrix1D; import cern.colt.matrix.tdouble.DoubleMatrix2D; import cern.colt.matrix.tdouble.impl.DenseDoubleMatrix2D; import cern.colt.matrix.tdouble.impl.DenseLargeDoubleMatrix2D; @@ -13,6 +14,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashSet; @@ -34,624 +36,646 @@ */ public class DoubleMatrixDataset { - static final IOException doubleMatrixDatasetNonUniqueHeaderException = new IOException("Tried to use a non-unique header set in an identifier HashMap"); - static final Logger LOGGER = Logger.getLogger(DoubleMatrixDataset.class.getName()); - protected DoubleMatrix2D matrix; - protected LinkedHashMap hashRows; - protected LinkedHashMap hashCols; - - public DoubleMatrixDataset() { - hashRows = new LinkedHashMap(); - hashCols = new LinkedHashMap(); - } - - public DoubleMatrixDataset(int rows, int columns) { - hashRows = new LinkedHashMap((int) Math.ceil(rows / 0.75)); - hashCols = new LinkedHashMap((int) Math.ceil(columns / 0.75)); - if ((rows * (long) columns) < (Integer.MAX_VALUE - 2)) { - matrix = new 
DenseDoubleMatrix2D(rows, columns); - } else { - matrix = new DenseLargeDoubleMatrix2D(rows, columns); - } - } - - public DoubleMatrixDataset(LinkedHashMap hashRows, LinkedHashMap hashCols) { - this.hashRows = hashRows; - this.hashCols = hashCols; - if ((hashRows.size() * (long) hashCols.size()) < (Integer.MAX_VALUE - 2)) { - matrix = new DenseDoubleMatrix2D(hashRows.size(), hashCols.size()); - } else { - matrix = new DenseLargeDoubleMatrix2D(hashRows.size(), hashCols.size()); - } - } - - public DoubleMatrixDataset(DoubleMatrix2D matrix, LinkedHashMap hashRows, LinkedHashMap hashCols) { - this.hashRows = hashRows; - this.hashCols = hashCols; - this.matrix = matrix; - } - - public DoubleMatrixDataset(List rowNames, List colNames) { - - hashRows = new LinkedHashMap(rowNames.size()); - hashCols = new LinkedHashMap(colNames.size()); - - int i = 0; - for (R row : rowNames) { - hashRows.put(row, i); - ++i; - } - - i = 0; - for (C col : colNames) { - hashCols.put(col, i); - } - - if ((hashRows.size() * (long) hashCols.size()) < (Integer.MAX_VALUE - 2)) { - matrix = new DenseDoubleMatrix2D(hashRows.size(), hashCols.size()); - } else { - matrix = new DenseLargeDoubleMatrix2D(hashRows.size(), hashCols.size()); - } - - } - - public static DoubleMatrixDataset loadDoubleData(String fileName) throws IOException { - if ((fileName.endsWith(".txt") || fileName.endsWith(".tsv") || fileName.endsWith(".txt.gz"))) { - return loadDoubleTextData(fileName, "\t"); - } else if (fileName.endsWith(".binary")) { - return loadDoubleBinaryData(fileName); - } else { - throw new IllegalArgumentException("File type must be \".txt\", \".tsv\" or \".txt.gz\" when delimiter is set to: \"tab\" \n Input filename: " + fileName); - } - } - - public static DoubleMatrixDataset loadDoubleTextData(String fileName, String delimiter) throws IOException { - if (!(fileName.endsWith(".txt") || fileName.endsWith(".tsv") || fileName.endsWith(".txt.gz"))) { - throw new IllegalArgumentException("File type must be 
\".txt\", \".tsv\" or \".txt.gz\" when delimiter is set. \n Input filename: " + fileName); - } - - Pattern splitPatern = Pattern.compile(delimiter); - - int columnOffset = 1; - - TextFile in = new TextFile(fileName, TextFile.R); - String str = in.readLine(); // header - String[] data = splitPatern.split(str); - - int tmpCols = (data.length - columnOffset); - - LinkedHashMap colMap = new LinkedHashMap((int) Math.ceil(tmpCols / 0.75)); - - for (int s = 0; s < tmpCols; s++) { - String colName = data[s + columnOffset]; - if (!colMap.containsKey(colName)) { - colMap.put(colName, s); - } else { - LOGGER.warning("Duplicated column name!"); - throw (doubleMatrixDatasetNonUniqueHeaderException); - } - } - - int tmpRows = 0; - - while (in.readLine() != null) { - tmpRows++; - } - in.close(); - - LinkedHashMap rowMap = new LinkedHashMap((int) Math.ceil(tmpRows / 0.75)); - DoubleMatrix2D tmpMatrix; - - if ((tmpRows * (long) tmpCols) < (Integer.MAX_VALUE - 2)) { - tmpMatrix = new DenseDoubleMatrix2D(tmpRows, tmpCols); - } else { - tmpMatrix = new DenseLargeDoubleMatrix2D(tmpRows, tmpCols); - } - in.open(); - in.readLine(); // read header - int row = 0; - - boolean correctData = true; - while ((str = in.readLine()) != null) { - data = splitPatern.split(str); - - if (!rowMap.containsKey(data[0])) { - rowMap.put(data[0], row); - for (int s = 0; s < tmpCols; s++) { - double d; - try { - d = Double.parseDouble(data[s + columnOffset]); - } catch (NumberFormatException e) { - correctData = false; - d = Double.NaN; - } - tmpMatrix.setQuick(row, s, d); - } - row++; - } else { - LOGGER.warning("Duplicated row name!"); - throw (doubleMatrixDatasetNonUniqueHeaderException); - } - - } - if (!correctData) { - LOGGER.warning("Your data contains NaN/unparseable values!"); - } - in.close(); - - DoubleMatrixDataset dataset = new DoubleMatrixDataset(tmpMatrix, rowMap, colMap); - - LOGGER.log(Level.INFO, "''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileName, 
dataset.matrix.rows(), dataset.matrix.columns()}); - return dataset; - } - - public static DoubleMatrixDataset loadSubsetOfTextDoubleData(String fileName, String delimiter, HashSet desiredRows, HashSet desiredCols) throws IOException { - if (!(fileName.endsWith(".txt") || fileName.endsWith(".txt.gz"))) { - throw new IllegalArgumentException("File type must be .txt when delimiter is given (given filename: " + fileName + ")"); - } - - LinkedHashSet desiredColPos = new LinkedHashSet(); - - Pattern splitPatern = Pattern.compile(delimiter); - - int columnOffset = 1; - - TextFile in = new TextFile(fileName, TextFile.R); - String str = in.readLine(); // header - String[] data = splitPatern.split(str); - - int tmpCols = (data.length - columnOffset); - - LinkedHashMap colMap = new LinkedHashMap((int) Math.ceil(tmpCols / 0.75)); - - int storedCols = 0; - for (int s = 0; s < tmpCols; s++) { - String colName = data[s + columnOffset]; - if (!colMap.containsKey(colName) && (desiredCols == null || desiredCols.contains(colName) || desiredCols.isEmpty())) { - colMap.put(colName, storedCols); - desiredColPos.add((s)); - storedCols++; - } else if (colMap.containsKey(colName)) { - LOGGER.warning("Duplicated column name!"); - System.out.println("Tried to add: " + colName); - throw (doubleMatrixDatasetNonUniqueHeaderException); - } - } - - LinkedHashSet desiredRowPos = new LinkedHashSet(); - int rowsToStore = 0; - int totalRows = 0; - //System.out.println(desiredRows.toString()); - while ((str = in.readLine()) != null) { - String[] info = splitPatern.split(str); - if (desiredRows == null || desiredRows.contains(info[0]) || desiredRows.isEmpty()) { - rowsToStore++; - desiredRowPos.add(totalRows); - } - totalRows++; - } - in.close(); - - DoubleMatrix2D matrix; - if ((rowsToStore * (long) tmpCols) < (Integer.MAX_VALUE - 2)) { - matrix = new DenseDoubleMatrix2D(rowsToStore, storedCols); - } else { - matrix = new DenseLargeDoubleMatrix2D(rowsToStore, storedCols); - } - - in.open(); - 
in.readLine(); // read header - int storingRow = 0; - totalRows = 0; - LinkedHashMap rowMap = new LinkedHashMap((int) Math.ceil(rowsToStore / 0.75)); - - boolean correctData = true; - while ((str = in.readLine()) != null) { - - if (desiredRowPos.contains(totalRows)) { - data = splitPatern.split(str); - if (!rowMap.containsKey(data[0])) { - rowMap.put(data[0], storingRow); - int storingCol = 0; - for (int s : desiredColPos) { - double d; - try { - d = Double.parseDouble(data[s + columnOffset]); - } catch (NumberFormatException e) { - correctData = false; - d = Double.NaN; - } - matrix.setQuick(storingRow, storingCol, d); - storingCol++; - } - storingRow++; - } else if (rowMap.containsKey(data[0])) { - LOGGER.warning("Duplicated row name!"); - System.out.println("Tried to add: " + data[0]); - throw (doubleMatrixDatasetNonUniqueHeaderException); - } - } - totalRows++; - } - if (!correctData) { - LOGGER.warning("Your data contains NaN/unparseable values!"); - } - in.close(); - - DoubleMatrixDataset dataset = new DoubleMatrixDataset(matrix, rowMap, colMap); - - LOGGER.log(Level.INFO, "''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileName, dataset.matrix.rows(), dataset.matrix.columns()}); - return dataset; - } - - private static DoubleMatrixDataset loadDoubleBinaryData(String fileName) throws FileNotFoundException, IOException { - //First load the raw binary data: - File fileBinary = new File(fileName + ".dat"); - BufferedInputStream in; - int nrRows; - int nrCols; - in = new BufferedInputStream(new FileInputStream(fileBinary)); - byte[] bytes = new byte[4]; - in.read(bytes, 0, 4); - nrRows = byteArrayToInt(bytes); - in.read(bytes, 0, 4); - nrCols = byteArrayToInt(bytes); - - DoubleMatrix2D matrix; - if ((nrRows * (long) nrCols) < (Integer.MAX_VALUE - 2)) { - matrix = new DenseDoubleMatrix2D(nrRows, nrCols); - } else { - matrix = new DenseLargeDoubleMatrix2D(nrRows, nrCols); - } - - //Now load the row and column identifiers from files - LinkedHashMap 
rowMap = loadIdentifiers(fileName + ".rows.txt"); - LinkedHashMap colMap = loadIdentifiers(fileName + ".cols.txt"); - - byte[] buffer = new byte[nrCols * 8]; - long bits; - for (int row = 0; row < nrRows; row++) { - in.read(buffer, 0, nrCols * 8); - int bufferLoc = 0; - for (int col = 0; col < nrCols; col++) { - bits = (long) (0xff & buffer[bufferLoc + 7]) - | (long) (0xff & buffer[bufferLoc + 6]) << 8 - | (long) (0xff & buffer[bufferLoc + 5]) << 16 - | (long) (0xff & buffer[bufferLoc + 4]) << 24 - | (long) (0xff & buffer[bufferLoc + 3]) << 32 - | (long) (0xff & buffer[bufferLoc + 2]) << 40 - | (long) (0xff & buffer[bufferLoc + 1]) << 48 - | (long) (buffer[bufferLoc]) << 56; - - matrix.setQuick(row, col, Double.longBitsToDouble(bits)); - bufferLoc += 8; - } - } - in.close(); - - DoubleMatrixDataset dataset = new DoubleMatrixDataset(matrix, rowMap, colMap); - LOGGER.log(Level.INFO, "Binary file ''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileName, nrRows, nrCols}); - - return dataset; - } - - private static LinkedHashMap loadIdentifiers(String filename) throws IOException { - TextFile tf = new TextFile(filename, false); - String[] rowsArr = tf.readAsArray(); - tf.close(); - LinkedHashMap rowMap = new LinkedHashMap(); - for (String row : rowsArr) { - rowMap.put(row, rowMap.size()); - } - return rowMap; - } - - public void save(File file) throws IOException { - TextFile out = new TextFile(file, TextFile.W); - - out.append('-'); - for (C col : hashCols.keySet()) { - - out.append('\t'); - out.append(col.toString()); - } - out.append('\n'); - int r = 0; - for (R row : hashRows.keySet()) { - out.append(row.toString()); - for (int c = 0; c < matrix.columns(); c++) { - out.append('\t'); - out.append(String.valueOf(matrix.getQuick(r, c))); - } - out.append('\n'); - ++r; - } - out.close(); - } - - public void save(String fileName) throws IOException { - save(new File(fileName)); - } - - public void saveDice(String fileName) throws IOException { - TextFile 
out = new TextFile(fileName, TextFile.W); - - out.append('-'); - for (R row : hashRows.keySet()) { - out.append('\t'); - out.append(row.toString()); - } - out.append('\n'); - - int c = 0; - for (C col : hashCols.keySet()) { - out.append(col.toString()); - for (int r = 0; r < matrix.rows(); r++) { - - out.append('\t'); - out.append(String.valueOf(matrix.getQuick(r, c))); - } - out.append('\n'); - ++c; - } - out.close(); - } - - private static byte[] intToByteArray(int value) { - return new byte[]{(byte) (value >>> 24), - (byte) (value >>> 16), - (byte) (value >>> 8), - (byte) value}; - } - - private static int byteArrayToInt(byte[] b) { - return (b[0] << 24) - + ((b[1] & 0xff) << 16) - + ((b[2] & 0xff) << 8) - + (b[3] & 0xff); - } - - //Getters and setters - public int rows() { - return matrix.rows(); - } - - public int columns() { - return matrix.columns(); - } - - public LinkedHashMap getHashRows() { - return hashRows; - } - - public void setHashRows(LinkedHashMap hashRows) { - this.hashRows = hashRows; - } - - public LinkedHashMap getHashCols() { - return hashCols; - } - - public void setHashCols(LinkedHashMap hashCols) { - this.hashCols = hashCols; - } - - public ArrayList getRowObjects() { - return new ArrayList(hashRows.keySet()); - } - - public void setRowObjects(List arrayList) throws Exception { - LinkedHashMap newHashRows = new LinkedHashMap((int) Math.ceil(arrayList.size() / 0.75)); - int i = 0; - for (R s : arrayList) { - if (!newHashRows.containsKey(s)) { - newHashRows.put(s, i); - } else { - System.out.println("Error, new row names contains dupilcates."); - throw (doubleMatrixDatasetNonUniqueHeaderException); - } - i++; - } - - this.hashRows = newHashRows; - } - - public ArrayList getColObjects() { - return new ArrayList(hashCols.keySet()); - } - - public void setColObjects(List arrayList) throws Exception { - LinkedHashMap newHashCols = new LinkedHashMap((int) Math.ceil(arrayList.size() / 0.75)); - int i = 0; - for (C s : arrayList) { - if 
(!newHashCols.containsKey(s)) { - newHashCols.put(s, i); - } else { - System.out.println("Error, new column names contains dupilcates."); - throw (doubleMatrixDatasetNonUniqueHeaderException); - } - i++; - } - this.hashCols = newHashCols; - } - - public DoubleMatrix2D getMatrix() { - return matrix; - } - - public void setMatrix(DoubleMatrix2D matrix) { - this.matrix = matrix; - } - - public void setMatrix(double[][] matrix) { - if ((matrix.length * (long) matrix[0].length) < (Integer.MAX_VALUE - 2)) { - this.matrix = new DenseDoubleMatrix2D(matrix); - } else { - this.matrix = new DenseLargeDoubleMatrix2D(matrix.length, matrix[0].length); - this.matrix.assign(matrix); - } - } - - /** - * Order columns - * - */ - public void OrderOnColumnnames() { - LinkedHashMap newColHash = new LinkedHashMap((int) Math.ceil(this.matrix.columns() / 0.75)); - ArrayList names = this.getColObjects(); - Collections.sort(names); - - int pos = 0; - for (C name : names) { - newColHash.put(name, pos); - pos++; - } - reorderCols(newColHash); - } - - /** - * Order rows - * - */ - public void OrderOnRownames() { - LinkedHashMap newRowHash = new LinkedHashMap((int) Math.ceil(this.matrix.rows() / 0.75)); - ArrayList names = this.getRowObjects(); - Collections.sort(names); - - int pos = -1; - for (R name : names) { - pos++; - newRowHash.put(name, pos); - } - reorderRows(newRowHash); - - } - - public void reorderRows(LinkedHashMap mappingIndex) { - boolean equal = compareHashRows(mappingIndex, this.hashRows); - if (!equal) { - DoubleMatrix2D newRawData; - if ((this.rows() * (long) this.columns()) < (Integer.MAX_VALUE - 2)) { - newRawData = new DenseDoubleMatrix2D(this.rows(), this.columns()); - } else { - newRawData = new DenseLargeDoubleMatrix2D(this.rows(), this.columns()); - } - - for (Map.Entry ent : mappingIndex.entrySet()) { - int pos = this.getHashRows().get(ent.getKey()); - for (int s = 0; s < this.columns(); ++s) { - newRawData.set(ent.getValue(), s, this.getMatrix().get(pos, s)); - } - } 
- this.setHashRows(mappingIndex); - this.setMatrix(newRawData); - } - - } - - public void reorderCols(LinkedHashMap mappingIndex) { - boolean equal = compareHashCols(mappingIndex, this.hashCols); - if (!equal) { - DoubleMatrix2D newRawData; - if ((this.rows() * (long) this.columns()) < (Integer.MAX_VALUE - 2)) { - newRawData = new DenseDoubleMatrix2D(this.rows(), this.columns()); - } else { - newRawData = new DenseLargeDoubleMatrix2D(this.rows(), this.columns()); - } - - for (Map.Entry ent : mappingIndex.entrySet()) { - int pos = this.getHashCols().get(ent.getKey()); - for (int p = 0; p < this.rows(); ++p) { - newRawData.set(p, ent.getValue(), this.getMatrix().get(p, pos)); - } - } - - this.setHashCols(mappingIndex); - this.setMatrix(newRawData); - } - } - - public DoubleMatrixDataset viewDice() { - return new DoubleMatrixDataset(matrix.viewDice(), hashCols, hashRows); - } - - private boolean compareHashCols(LinkedHashMap mappingIndex, LinkedHashMap originalHash) { - - for (Entry entry : mappingIndex.entrySet()) { - if (entry.getValue() != originalHash.get(entry.getKey())) { - return false; - } - } - return true; - } - - private boolean compareHashRows(LinkedHashMap mappingIndex, LinkedHashMap originalHash) { - - for (Entry entry : mappingIndex.entrySet()) { - if (entry.getValue() != originalHash.get(entry.getKey())) { - return false; - } - } - return true; - } - - /** - * Set a element of the dataset. - * - * @param rowName - * @param columnName - * @param value - */ - public void setElement(R rowName, C columnName, double value) { - - Integer row = hashRows.get(rowName); - Integer column = hashCols.get(columnName); - - if (row != null && column != null) { - matrix.setQuick(row, column, value); - } else { - if (row == null) { - throw new NoSuchElementException("Row not found: " + rowName.toString()); - } else { - throw new NoSuchElementException("Column not found: " + columnName.toString()); - } - - } - - } - - /** - * Get specific element. 
- * - * @param rowName - * @param columnName - * @return - */ - public double getElement(R rowName, C columnName) { - - Integer row = hashRows.get(rowName); - Integer column = hashCols.get(columnName); - - if (row != null && column != null) { - return matrix.getQuick(row, column); - } else { - if (row == null) { - throw new NoSuchElementException("Row not found: " + rowName.toString()); - } else { - throw new NoSuchElementException("Column not found: " + columnName.toString()); - } - } - } - - /** - * Get specific element. - * - * @param row - * @param column - * @return - */ - public double getElement(int row, int column) { - - return matrix.get(row, column); - } + static final IOException doubleMatrixDatasetNonUniqueHeaderException = new IOException("Tried to use a non-unique header set in an identifier HashMap"); + static final Logger LOGGER = Logger.getLogger(DoubleMatrixDataset.class.getName()); + + public static DoubleMatrixDataset loadDoubleTextData(String expressionDataPath, char c) { + throw new UnsupportedOperationException("Not supported yet."); + } + protected DoubleMatrix2D matrix; + protected LinkedHashMap hashRows; + protected LinkedHashMap hashCols; + + public DoubleMatrixDataset() { + hashRows = new LinkedHashMap(); + hashCols = new LinkedHashMap(); + } + + public DoubleMatrixDataset(int rows, int columns) { + hashRows = new LinkedHashMap((int) Math.ceil(rows / 0.75)); + hashCols = new LinkedHashMap((int) Math.ceil(columns / 0.75)); + if ((rows * (long) columns) < (Integer.MAX_VALUE - 2)) { + matrix = new DenseDoubleMatrix2D(rows, columns); + } else { + matrix = new DenseLargeDoubleMatrix2D(rows, columns); + } + } + + public DoubleMatrixDataset(LinkedHashMap hashRows, LinkedHashMap hashCols) { + this.hashRows = hashRows; + this.hashCols = hashCols; + if ((hashRows.size() * (long) hashCols.size()) < (Integer.MAX_VALUE - 2)) { + matrix = new DenseDoubleMatrix2D(hashRows.size(), hashCols.size()); + } else { + matrix = new 
DenseLargeDoubleMatrix2D(hashRows.size(), hashCols.size()); + } + } + + public DoubleMatrixDataset(DoubleMatrix2D matrix, LinkedHashMap hashRows, LinkedHashMap hashCols) { + this.hashRows = hashRows; + this.hashCols = hashCols; + this.matrix = matrix; + } + + public DoubleMatrixDataset(Collection rowNames, Collection colNames) { + + hashRows = new LinkedHashMap(rowNames.size()); + hashCols = new LinkedHashMap(colNames.size()); + + int i = 0; + for (R row : rowNames) { + hashRows.put(row, i); + ++i; + } + + i = 0; + for (C col : colNames) { + hashCols.put(col, i); + ++i; + } + + if ((hashRows.size() * (long) hashCols.size()) < (Integer.MAX_VALUE - 2)) { + matrix = new DenseDoubleMatrix2D(hashRows.size(), hashCols.size()); + } else { + matrix = new DenseLargeDoubleMatrix2D(hashRows.size(), hashCols.size()); + } + + } + + public static DoubleMatrixDataset loadDoubleData(String fileName) throws IOException { + if ((fileName.endsWith(".txt") || fileName.endsWith(".tsv") || fileName.endsWith(".txt.gz"))) { + return loadDoubleTextData(fileName, "\t"); + } else if (fileName.endsWith(".binary")) { + return loadDoubleBinaryData(fileName); + } else { + throw new IllegalArgumentException("File type must be \".txt\", \".tsv\" or \".txt.gz\" when delimiter is set to: \"tab\" \n Input filename: " + fileName); + } + } + + public static DoubleMatrixDataset loadDoubleTextData(String fileName, String delimiter) throws IOException { + if (!(fileName.endsWith(".txt") || fileName.endsWith(".tsv") || fileName.endsWith(".txt.gz"))) { + throw new IllegalArgumentException("File type must be \".txt\", \".tsv\" or \".txt.gz\" when delimiter is set. 
\n Input filename: " + fileName); + } + + Pattern splitPatern = Pattern.compile(delimiter); + + int columnOffset = 1; + + TextFile in = new TextFile(fileName, TextFile.R); + String str = in.readLine(); // header + String[] data = splitPatern.split(str); + + int tmpCols = (data.length - columnOffset); + + LinkedHashMap colMap = new LinkedHashMap((int) Math.ceil(tmpCols / 0.75)); + + for (int s = 0; s < tmpCols; s++) { + String colName = data[s + columnOffset]; + if (!colMap.containsKey(colName)) { + colMap.put(colName, s); + } else { + LOGGER.warning("Duplicated column name!"); + throw (doubleMatrixDatasetNonUniqueHeaderException); + } + } + + int tmpRows = 0; + + while (in.readLine() != null) { + tmpRows++; + } + in.close(); + + LinkedHashMap rowMap = new LinkedHashMap((int) Math.ceil(tmpRows / 0.75)); + DoubleMatrix2D tmpMatrix; + + if ((tmpRows * (long) tmpCols) < (Integer.MAX_VALUE - 2)) { + tmpMatrix = new DenseDoubleMatrix2D(tmpRows, tmpCols); + } else { + tmpMatrix = new DenseLargeDoubleMatrix2D(tmpRows, tmpCols); + } + in.open(); + in.readLine(); // read header + int row = 0; + + boolean correctData = true; + while ((str = in.readLine()) != null) { + data = splitPatern.split(str); + + if (!rowMap.containsKey(data[0])) { + rowMap.put(data[0], row); + for (int s = 0; s < tmpCols; s++) { + double d; + try { + d = Double.parseDouble(data[s + columnOffset]); + } catch (NumberFormatException e) { + correctData = false; + d = Double.NaN; + } + tmpMatrix.setQuick(row, s, d); + } + row++; + } else { + LOGGER.warning("Duplicated row name!"); + throw (doubleMatrixDatasetNonUniqueHeaderException); + } + + } + if (!correctData) { + LOGGER.warning("Your data contains NaN/unparseable values!"); + } + in.close(); + + DoubleMatrixDataset dataset = new DoubleMatrixDataset(tmpMatrix, rowMap, colMap); + + LOGGER.log(Level.INFO, "''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileName, dataset.matrix.rows(), dataset.matrix.columns()}); + return dataset; + } + + 
public static DoubleMatrixDataset loadSubsetOfTextDoubleData(String fileName, String delimiter, HashSet desiredRows, HashSet desiredCols) throws IOException { + if (!(fileName.endsWith(".txt") || fileName.endsWith(".txt.gz"))) { + throw new IllegalArgumentException("File type must be .txt when delimiter is given (given filename: " + fileName + ")"); + } + + LinkedHashSet desiredColPos = new LinkedHashSet(); + + Pattern splitPatern = Pattern.compile(delimiter); + + int columnOffset = 1; + + TextFile in = new TextFile(fileName, TextFile.R); + String str = in.readLine(); // header + String[] data = splitPatern.split(str); + + int tmpCols = (data.length - columnOffset); + + LinkedHashMap colMap = new LinkedHashMap((int) Math.ceil(tmpCols / 0.75)); + + int storedCols = 0; + for (int s = 0; s < tmpCols; s++) { + String colName = data[s + columnOffset]; + if (!colMap.containsKey(colName) && (desiredCols == null || desiredCols.contains(colName) || desiredCols.isEmpty())) { + colMap.put(colName, storedCols); + desiredColPos.add((s)); + storedCols++; + } else if (colMap.containsKey(colName)) { + LOGGER.warning("Duplicated column name!"); + System.out.println("Tried to add: " + colName); + throw (doubleMatrixDatasetNonUniqueHeaderException); + } + } + + LinkedHashSet desiredRowPos = new LinkedHashSet(); + int rowsToStore = 0; + int totalRows = 0; + //System.out.println(desiredRows.toString()); + while ((str = in.readLine()) != null) { + String[] info = splitPatern.split(str); + if (desiredRows == null || desiredRows.contains(info[0]) || desiredRows.isEmpty()) { + rowsToStore++; + desiredRowPos.add(totalRows); + } + totalRows++; + } + in.close(); + + DoubleMatrix2D matrix; + if ((rowsToStore * (long) tmpCols) < (Integer.MAX_VALUE - 2)) { + matrix = new DenseDoubleMatrix2D(rowsToStore, storedCols); + } else { + matrix = new DenseLargeDoubleMatrix2D(rowsToStore, storedCols); + } + + in.open(); + in.readLine(); // read header + int storingRow = 0; + totalRows = 0; + LinkedHashMap 
rowMap = new LinkedHashMap((int) Math.ceil(rowsToStore / 0.75)); + + boolean correctData = true; + while ((str = in.readLine()) != null) { + + if (desiredRowPos.contains(totalRows)) { + data = splitPatern.split(str); + if (!rowMap.containsKey(data[0])) { + rowMap.put(data[0], storingRow); + int storingCol = 0; + for (int s : desiredColPos) { + double d; + try { + d = Double.parseDouble(data[s + columnOffset]); + } catch (NumberFormatException e) { + correctData = false; + d = Double.NaN; + } + matrix.setQuick(storingRow, storingCol, d); + storingCol++; + } + storingRow++; + } else if (rowMap.containsKey(data[0])) { + LOGGER.warning("Duplicated row name!"); + System.out.println("Tried to add: " + data[0]); + throw (doubleMatrixDatasetNonUniqueHeaderException); + } + } + totalRows++; + } + if (!correctData) { + LOGGER.warning("Your data contains NaN/unparseable values!"); + } + in.close(); + + DoubleMatrixDataset dataset = new DoubleMatrixDataset(matrix, rowMap, colMap); + + LOGGER.log(Level.INFO, "''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileName, dataset.matrix.rows(), dataset.matrix.columns()}); + return dataset; + } + + private static DoubleMatrixDataset loadDoubleBinaryData(String fileName) throws FileNotFoundException, IOException { + //First load the raw binary data: + File fileBinary = new File(fileName + ".dat"); + BufferedInputStream in; + int nrRows; + int nrCols; + in = new BufferedInputStream(new FileInputStream(fileBinary)); + byte[] bytes = new byte[4]; + in.read(bytes, 0, 4); + nrRows = byteArrayToInt(bytes); + in.read(bytes, 0, 4); + nrCols = byteArrayToInt(bytes); + + DoubleMatrix2D matrix; + if ((nrRows * (long) nrCols) < (Integer.MAX_VALUE - 2)) { + matrix = new DenseDoubleMatrix2D(nrRows, nrCols); + } else { + matrix = new DenseLargeDoubleMatrix2D(nrRows, nrCols); + } + + //Now load the row and column identifiers from files + LinkedHashMap rowMap = loadIdentifiers(fileName + ".rows.txt"); + LinkedHashMap colMap = 
loadIdentifiers(fileName + ".cols.txt"); + + byte[] buffer = new byte[nrCols * 8]; + long bits; + for (int row = 0; row < nrRows; row++) { + in.read(buffer, 0, nrCols * 8); + int bufferLoc = 0; + for (int col = 0; col < nrCols; col++) { + bits = (long) (0xff & buffer[bufferLoc + 7]) + | (long) (0xff & buffer[bufferLoc + 6]) << 8 + | (long) (0xff & buffer[bufferLoc + 5]) << 16 + | (long) (0xff & buffer[bufferLoc + 4]) << 24 + | (long) (0xff & buffer[bufferLoc + 3]) << 32 + | (long) (0xff & buffer[bufferLoc + 2]) << 40 + | (long) (0xff & buffer[bufferLoc + 1]) << 48 + | (long) (buffer[bufferLoc]) << 56; + + matrix.setQuick(row, col, Double.longBitsToDouble(bits)); + bufferLoc += 8; + } + } + in.close(); + + DoubleMatrixDataset dataset = new DoubleMatrixDataset(matrix, rowMap, colMap); + LOGGER.log(Level.INFO, "Binary file ''{0}'' has been loaded, nrRows: {1} nrCols: {2}", new Object[]{fileName, nrRows, nrCols}); + + return dataset; + } + + private static LinkedHashMap loadIdentifiers(String filename) throws IOException { + TextFile tf = new TextFile(filename, false); + String[] rowsArr = tf.readAsArray(); + tf.close(); + LinkedHashMap rowMap = new LinkedHashMap(); + for (String row : rowsArr) { + rowMap.put(row, rowMap.size()); + } + return rowMap; + } + + public void save(File file) throws IOException { + TextFile out = new TextFile(file, TextFile.W); + + out.append('-'); + for (C col : hashCols.keySet()) { + + out.append('\t'); + out.append(col.toString()); + } + out.append('\n'); + int r = 0; + for (R row : hashRows.keySet()) { + out.append(row.toString()); + for (int c = 0; c < matrix.columns(); c++) { + out.append('\t'); + out.append(String.valueOf(matrix.getQuick(r, c))); + } + out.append('\n'); + ++r; + } + out.close(); + } + + public void save(String fileName) throws IOException { + save(new File(fileName)); + } + + public void saveDice(String fileName) throws IOException { + TextFile out = new TextFile(fileName, TextFile.W); + + out.append('-'); + for (R row 
: hashRows.keySet()) { + out.append('\t'); + out.append(row.toString()); + } + out.append('\n'); + + int c = 0; + for (C col : hashCols.keySet()) { + out.append(col.toString()); + for (int r = 0; r < matrix.rows(); r++) { + + out.append('\t'); + out.append(String.valueOf(matrix.getQuick(r, c))); + } + out.append('\n'); + ++c; + } + out.close(); + } + + private static byte[] intToByteArray(int value) { + return new byte[]{(byte) (value >>> 24), + (byte) (value >>> 16), + (byte) (value >>> 8), + (byte) value}; + } + + private static int byteArrayToInt(byte[] b) { + return (b[0] << 24) + + ((b[1] & 0xff) << 16) + + ((b[2] & 0xff) << 8) + + (b[3] & 0xff); + } + + //Getters and setters + public int rows() { + return matrix.rows(); + } + + public int columns() { + return matrix.columns(); + } + + public LinkedHashMap getHashRows() { + return hashRows; + } + + public void setHashRows(LinkedHashMap hashRows) { + this.hashRows = hashRows; + } + + public LinkedHashMap getHashCols() { + return hashCols; + } + + public void setHashCols(LinkedHashMap hashCols) { + this.hashCols = hashCols; + } + + public ArrayList getRowObjects() { + return new ArrayList(hashRows.keySet()); + } + + public void setRowObjects(List arrayList) throws Exception { + LinkedHashMap newHashRows = new LinkedHashMap((int) Math.ceil(arrayList.size() / 0.75)); + int i = 0; + for (R s : arrayList) { + if (!newHashRows.containsKey(s)) { + newHashRows.put(s, i); + } else { + System.out.println("Error, new row names contains dupilcates."); + throw (doubleMatrixDatasetNonUniqueHeaderException); + } + i++; + } + + this.hashRows = newHashRows; + } + + public ArrayList getColObjects() { + return new ArrayList(hashCols.keySet()); + } + + public void setColObjects(List arrayList) throws Exception { + LinkedHashMap newHashCols = new LinkedHashMap((int) Math.ceil(arrayList.size() / 0.75)); + int i = 0; + for (C s : arrayList) { + if (!newHashCols.containsKey(s)) { + newHashCols.put(s, i); + } else { + 
System.out.println("Error, new column names contains dupilcates."); + throw (doubleMatrixDatasetNonUniqueHeaderException); + } + i++; + } + this.hashCols = newHashCols; + } + + public DoubleMatrix2D getMatrix() { + return matrix; + } + + public void setMatrix(DoubleMatrix2D matrix) { + this.matrix = matrix; + } + + public void setMatrix(double[][] matrix) { + if ((matrix.length * (long) matrix[0].length) < (Integer.MAX_VALUE - 2)) { + this.matrix = new DenseDoubleMatrix2D(matrix); + } else { + this.matrix = new DenseLargeDoubleMatrix2D(matrix.length, matrix[0].length); + this.matrix.assign(matrix); + } + } + + /** + * Order columns + * + */ + public void OrderOnColumnnames() { + LinkedHashMap newColHash = new LinkedHashMap((int) Math.ceil(this.matrix.columns() / 0.75)); + ArrayList names = this.getColObjects(); + Collections.sort(names); + + int pos = 0; + for (C name : names) { + newColHash.put(name, pos); + pos++; + } + reorderCols(newColHash); + } + + /** + * Order rows + * + */ + public void OrderOnRownames() { + LinkedHashMap newRowHash = new LinkedHashMap((int) Math.ceil(this.matrix.rows() / 0.75)); + ArrayList names = this.getRowObjects(); + Collections.sort(names); + + int pos = -1; + for (R name : names) { + pos++; + newRowHash.put(name, pos); + } + reorderRows(newRowHash); + + } + + public void reorderRows(LinkedHashMap mappingIndex) { + boolean equal = compareHashRows(mappingIndex, this.hashRows); + if (!equal) { + DoubleMatrix2D newRawData; + if ((this.rows() * (long) this.columns()) < (Integer.MAX_VALUE - 2)) { + newRawData = new DenseDoubleMatrix2D(this.rows(), this.columns()); + } else { + newRawData = new DenseLargeDoubleMatrix2D(this.rows(), this.columns()); + } + + for (Map.Entry ent : mappingIndex.entrySet()) { + int pos = this.getHashRows().get(ent.getKey()); + for (int s = 0; s < this.columns(); ++s) { + newRawData.set(ent.getValue(), s, this.getMatrix().get(pos, s)); + } + } + this.setHashRows(mappingIndex); + this.setMatrix(newRawData); + } + 
+ } + + public void reorderCols(LinkedHashMap mappingIndex) { + boolean equal = compareHashCols(mappingIndex, this.hashCols); + if (!equal) { + DoubleMatrix2D newRawData; + if ((this.rows() * (long) this.columns()) < (Integer.MAX_VALUE - 2)) { + newRawData = new DenseDoubleMatrix2D(this.rows(), this.columns()); + } else { + newRawData = new DenseLargeDoubleMatrix2D(this.rows(), this.columns()); + } + + for (Map.Entry ent : mappingIndex.entrySet()) { + int pos = this.getHashCols().get(ent.getKey()); + for (int p = 0; p < this.rows(); ++p) { + newRawData.set(p, ent.getValue(), this.getMatrix().get(p, pos)); + } + } + + this.setHashCols(mappingIndex); + this.setMatrix(newRawData); + } + } + + public DoubleMatrixDataset viewDice() { + return new DoubleMatrixDataset(matrix.viewDice(), hashCols, hashRows); + } + + private boolean compareHashCols(LinkedHashMap mappingIndex, LinkedHashMap originalHash) { + + for (Entry entry : mappingIndex.entrySet()) { + if (entry.getValue() != originalHash.get(entry.getKey())) { + return false; + } + } + return true; + } + + private boolean compareHashRows(LinkedHashMap mappingIndex, LinkedHashMap originalHash) { + + for (Entry entry : mappingIndex.entrySet()) { + if (entry.getValue() != originalHash.get(entry.getKey())) { + return false; + } + } + return true; + } + + /** + * Set a element of the dataset. + * + * @param rowName + * @param columnName + * @param value + */ + public void setElement(R rowName, C columnName, double value) { + + Integer row = hashRows.get(rowName); + Integer column = hashCols.get(columnName); + + if (row != null && column != null) { + matrix.setQuick(row, column, value); + } else { + if (row == null) { + throw new NoSuchElementException("Row not found: " + rowName.toString()); + } else { + throw new NoSuchElementException("Column not found: " + columnName.toString()); + } + + } + + } + + /** + * Get specific element. 
+ * + * @param rowName + * @param columnName + * @return + */ + public double getElement(R rowName, C columnName) { + + Integer row = hashRows.get(rowName); + Integer column = hashCols.get(columnName); + + if (row != null && column != null) { + return matrix.getQuick(row, column); + } else { + if (row == null) { + throw new NoSuchElementException("Row not found: " + rowName.toString()); + } else { + throw new NoSuchElementException("Column not found: " + columnName.toString()); + } + } + } + + public DoubleMatrix1D getRow (R rowName){ + Integer row = hashRows.get(rowName); + if (row != null){ + return matrix.viewRow(row); + } else { + throw new NoSuchElementException("Row not found: " + rowName.toString()); + } + } + + /** + * Get specific element. + * + * @param row + * @param column + * @return + */ + public double getElement(int row, int column) { + + return matrix.get(row, column); + } + + public boolean containsRow(R rowId){ + return hashRows.containsKey(rowId); + } + + public boolean containsCol(C colId){ + return hashCols.containsKey(colId); + } } diff --git a/genetica-libraries/src/main/java/umcg/genetica/math/stats/Heterogeneity.java b/genetica-libraries/src/main/java/umcg/genetica/math/stats/Heterogeneity.java index 5fb796f7c..015b2de06 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/math/stats/Heterogeneity.java +++ b/genetica-libraries/src/main/java/umcg/genetica/math/stats/Heterogeneity.java @@ -88,7 +88,7 @@ public static Pair getISq(double[] datasetZ, int[] datasetWeight double hetSum = 0; int hetDf = 0; for (int d = 0; d < datasetZ.length; d++) { - if (Double.isNaN(datasetZ[d])) { + if (!Double.isNaN(datasetZ[d])) { double expectedZ = Math.sqrt(datasetWeights[d]) * weightedZ / totalSample; hetSum += (datasetZ[d] - expectedZ) * (datasetZ[d] - expectedZ); diff --git a/genetica-libraries/src/main/java/umcg/genetica/math/stats/QuantileNormalization.java b/genetica-libraries/src/main/java/umcg/genetica/math/stats/QuantileNormalization.java 
index da40e6244..0b2bf4af0 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/math/stats/QuantileNormalization.java +++ b/genetica-libraries/src/main/java/umcg/genetica/math/stats/QuantileNormalization.java @@ -7,15 +7,20 @@ import java.util.ArrayList; import java.util.Arrays; import org.apache.commons.collections.primitives.ArrayDoubleList; +import org.apache.commons.math3.stat.ranking.NaNStrategy; +import org.apache.commons.math3.stat.ranking.NaturalRanking; +import org.apache.commons.math3.stat.ranking.RankingAlgorithm; +import org.apache.commons.math3.stat.ranking.TiesStrategy; import umcg.genetica.math.matrix.DoubleMatrixDataset; import umcg.genetica.util.RankArray; + /** * * @author Harm Jan & Marc Jan Bonder */ public class QuantileNormalization { - + private static final RankingAlgorithm COV_RANKER_TIE = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE); /** * Quantile normalize a double[][] double[probes][sample] * @@ -46,30 +51,35 @@ public static void quantilenormalize(double[][] rawData) { for (int probeID = 0; probeID < probeCount; probeID++) { rankedMean[probeID] /= (double) sampleCount; } + + double[] rankedMeanClasses = new double[probeCount-1]; + + for (int probeID = 0; probeID < (probeCount-1); probeID++) { + rankedMeanClasses[probeID] = ((rankedMean[probeID]+rankedMean[probeID+1])/2); + } - RankArray rda = new RankArray(); //Iterate through each sample: for (int s = 0; s < sampleCount; s++) { double[] probes = new double[probeCount]; for (int p = 0; p < probeCount; p++) { probes[p] = rawData[p][s]; } - double[] probesRanked = rda.rank(probes, true); + double[] probesRanked = COV_RANKER_TIE.rank(probes); double[] probesQuantileNormalized = new double[probeCount]; for (int p = 0; p < probeCount; p++) { if((probesRanked[p]%1)!=0){ - probesQuantileNormalized[p] = ((rankedMean[(int)Math.floor(probesRanked[p])]+rankedMean[(int)Math.ceil(probesRanked[p])])/2); - rawData[p][s] = probesQuantileNormalized[p]; + 
probesQuantileNormalized[p] = rankedMeanClasses[(int)Math.floor((probesRanked[p]-1))]; } else { - probesQuantileNormalized[p] = rankedMean[(int) probesRanked[p]]; - rawData[p][s] = probesQuantileNormalized[p]; + probesQuantileNormalized[p] = rankedMean[(int) (probesRanked[p]-1)]; } + + rawData[p][s] = probesQuantileNormalized[p]; } // double[] probesRankedAfterQQNorm = rda.rank(probesQuantileNormalized, false); - System.out.println("Normalized sample:\t" + (s+1) + "\tCorrelation original data and ranked data:\t" + JSci.maths.ArrayMath.correlation(probes, probesRanked) + "\tCorrelation original data and quantile normalized data:\t" + JSci.maths.ArrayMath.correlation(probes, probesQuantileNormalized) + "\tSpearman: "+spearman.correlation(probes, probesQuantileNormalized)); + System.out.println("Normalized sample:\t" + (s+1) + "\tPearson correlation original data and ranked data:\t" + JSci.maths.ArrayMath.correlation(probes, probesRanked) + "\ttSpearman correlation original data and quantile normalized data:\t"+spearman.correlation(probes, probesQuantileNormalized)); } } diff --git a/genetica-libraries/src/main/java/umcg/genetica/math/stats/ZScores.java b/genetica-libraries/src/main/java/umcg/genetica/math/stats/ZScores.java index 2aa378967..4175551f9 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/math/stats/ZScores.java +++ b/genetica-libraries/src/main/java/umcg/genetica/math/stats/ZScores.java @@ -148,6 +148,20 @@ public static double pToZ(double p) { return Probability.normalInverse(p); } + + /** + * + * Returns the absolute Z-score for a given p-value using a normal + * distribution. 
+ * + * @param p p-value + * @return absolute Z-score + */ + public static double pToZTwoTailed(double p) { + + p = p/2; + return pToZ(p); + } /** * diff --git a/genetica-libraries/src/main/java/umcg/genetica/methylation/ConvertBetaAndMvalues.java b/genetica-libraries/src/main/java/umcg/genetica/methylation/ConvertBetaAndMvalues.java index 877cd4930..cfdb225ee 100644 --- a/genetica-libraries/src/main/java/umcg/genetica/methylation/ConvertBetaAndMvalues.java +++ b/genetica-libraries/src/main/java/umcg/genetica/methylation/ConvertBetaAndMvalues.java @@ -58,6 +58,15 @@ public static void transformMToBetavalue(DoubleMatrix2D rawData){ } } + public static double[] transformMToBetavalue(double[] rawData){ + double[] betaCopy = new double[rawData.length]; + for (int s=0; s fixedValues = new HashSet(); - - for(int i=0; i fixedValues = new HashSet(); +// +// for(int i=0; i fixedValues = new HashSet(); diff --git a/pom.xml b/pom.xml index f8da03aba..afe972e4f 100644 --- a/pom.xml +++ b/pom.xml @@ -82,5 +82,6 @@ eqtl-functional-enrichment GeneticRiskScoreCalculator BinaryMetaAnalyzer - + eQTLInteractionAnalyser + \ No newline at end of file