diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..70a550b Binary files /dev/null and b/.DS_Store differ diff --git a/build.xml b/build.xml new file mode 100644 index 0000000..1af8d10 --- /dev/null +++ b/build.xml @@ -0,0 +1,74 @@ + + + + + + + + + + + Builds, tests, and runs the project ExpressionTable. + + + diff --git a/build/built-jar.properties b/build/built-jar.properties new file mode 100644 index 0000000..23598fd --- /dev/null +++ b/build/built-jar.properties @@ -0,0 +1,12 @@ +#Thu, 20 Jun 2013 16:45:22 +0400 + + +/Users/dashazhernakova/Documents/NetBeansProjects/ExpressionTable= + +/Users/dashazhernakova/Documents/NetBeansProjects/Correlation= + +/Users/dashazhernakova/Documents/NetBeansProjects/GeneticaLibraries= + +/Users/dashazhernakova/Documents/NetBeansProjects/eqtlmappingpipeline= + +/Users/dashazhernakova/Documents/NetBeansProjects/processTmap= diff --git a/build/classes/expressiontable/Coexpression.class b/build/classes/expressiontable/Coexpression.class new file mode 100644 index 0000000..fb5772e Binary files /dev/null and b/build/classes/expressiontable/Coexpression.class differ diff --git a/build/classes/expressiontable/ExpressionTable.class b/build/classes/expressiontable/ExpressionTable.class new file mode 100644 index 0000000..9e65a70 Binary files /dev/null and b/build/classes/expressiontable/ExpressionTable.class differ diff --git a/build/classes/expressiontable/Joiner.class b/build/classes/expressiontable/Joiner.class new file mode 100644 index 0000000..0b03a44 Binary files /dev/null and b/build/classes/expressiontable/Joiner.class differ diff --git a/build/classes/expressiontable/Normalizer.class b/build/classes/expressiontable/Normalizer.class new file mode 100644 index 0000000..7606bcf Binary files /dev/null and b/build/classes/expressiontable/Normalizer.class differ diff --git a/build/classes/expressiontable/ProbeToGeneConverter.class b/build/classes/expressiontable/ProbeToGeneConverter.class new file mode 100644 
index 0000000..2daea92 Binary files /dev/null and b/build/classes/expressiontable/ProbeToGeneConverter.class differ diff --git a/build/classes/expressiontable/Sorter$ValueComparator.class b/build/classes/expressiontable/Sorter$ValueComparator.class new file mode 100644 index 0000000..88fb5fc Binary files /dev/null and b/build/classes/expressiontable/Sorter$ValueComparator.class differ diff --git a/build/classes/expressiontable/Sorter.class b/build/classes/expressiontable/Sorter.class new file mode 100644 index 0000000..cbb43be Binary files /dev/null and b/build/classes/expressiontable/Sorter.class differ diff --git a/build/classes/expressiontable/Subtable.class b/build/classes/expressiontable/Subtable.class new file mode 100644 index 0000000..e1434f8 Binary files /dev/null and b/build/classes/expressiontable/Subtable.class differ diff --git a/dist/ExpressionTable.jar b/dist/ExpressionTable.jar new file mode 100644 index 0000000..9247a52 Binary files /dev/null and b/dist/ExpressionTable.jar differ diff --git a/dist/README.TXT b/dist/README.TXT new file mode 100644 index 0000000..e9e3676 --- /dev/null +++ b/dist/README.TXT @@ -0,0 +1,32 @@ +======================== +BUILD OUTPUT DESCRIPTION +======================== + +When you build an Java application project that has a main class, the IDE +automatically copies all of the JAR +files on the projects classpath to your projects dist/lib folder. The IDE +also adds each of the JAR files to the Class-Path element in the application +JAR files manifest file (MANIFEST.MF). + +To run the project from the command line, go to the dist folder and +type the following: + +java -jar "ExpressionTable.jar" + +To distribute this project, zip up the dist folder (including the lib folder) +and distribute the ZIP file. + +Notes: + +* If two JAR files on the project classpath have the same name, only the first +JAR file is copied to the lib folder. +* Only JAR files are copied to the lib folder. 
+If the classpath contains other types of files or folders, these files (folders) +are not copied. +* If a library on the projects classpath also has a Class-Path element +specified in the manifest,the content of the Class-Path element has to be on +the projects runtime path. +* To set a main class in a standard Java project, right-click the project node +in the Projects window and choose Properties. Then click Run and enter the +class name in the Main Class field. Alternatively, you can manually type the +class name in the manifest Main-Class element. diff --git a/dist/lib/Correlation.jar b/dist/lib/Correlation.jar new file mode 100644 index 0000000..33e0caf Binary files /dev/null and b/dist/lib/Correlation.jar differ diff --git a/dist/lib/GeneticaLibraries.jar b/dist/lib/GeneticaLibraries.jar new file mode 100644 index 0000000..78afa14 Binary files /dev/null and b/dist/lib/GeneticaLibraries.jar differ diff --git a/dist/lib/colt.jar b/dist/lib/colt.jar new file mode 100644 index 0000000..a7192f6 Binary files /dev/null and b/dist/lib/colt.jar differ diff --git a/dist/lib/commons-math-2.1.jar b/dist/lib/commons-math-2.1.jar new file mode 100644 index 0000000..43b4b36 Binary files /dev/null and b/dist/lib/commons-math-2.1.jar differ diff --git a/dist/lib/eQTLMappingPipeline.jar b/dist/lib/eQTLMappingPipeline.jar new file mode 100644 index 0000000..99708aa Binary files /dev/null and b/dist/lib/eQTLMappingPipeline.jar differ diff --git a/dist/lib/jsc.jar b/dist/lib/jsc.jar new file mode 100644 index 0000000..88d2af0 Binary files /dev/null and b/dist/lib/jsc.jar differ diff --git a/dist/lib/jscicore.jar b/dist/lib/jscicore.jar new file mode 100644 index 0000000..2fdca39 Binary files /dev/null and b/dist/lib/jscicore.jar differ diff --git a/manifest.mf b/manifest.mf new file mode 100644 index 0000000..328e8e5 --- /dev/null +++ b/manifest.mf @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +X-COMMENT: Main-Class will be added automatically by build + diff --git 
a/nbproject/build-impl.xml b/nbproject/build-impl.xml new file mode 100644 index 0000000..404c3b0 --- /dev/null +++ b/nbproject/build-impl.xml @@ -0,0 +1,1453 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must set src.dir + Must set test.src.dir + Must set build.dir + Must set dist.dir + Must set build.classes.dir + Must set dist.javadoc.dir + Must set build.test.classes.dir + Must set build.test.results.dir + Must set build.classes.excludes + Must set dist.jar + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must set javac.includes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + No tests executed. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must set JVM to use for profiling in profiler.info.jvm + Must set profiler agent JVM arguments in profiler.info.jvmargs.agent + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select some files in the IDE or set javac.includes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + To run this application from the command line without Ant, try: + + + + + + + java -cp "${run.classpath.with.dist.jar}" ${main.class} + + + + + + + + + + + + + + + + + + + + + + + + + To run this application from the command line without Ant, try: + + java -jar "${dist.jar.resolved}" + + + + + + + + + + + + + + + + + + + + + + + + + Must select one file in the IDE or set run.class + + + + Must select one file in the IDE or set run.class + + + + + + + + + + + + + + + + + + + + + + + Must select one file in the IDE or set debug.class + + + + + Must select one file in the IDE or set debug.class + + + + + Must set fix.includes + + + + + + + + + + This target only works when run from inside the NetBeans IDE. 
+ + + + + + + + + Must select one file in the IDE or set profile.class + This target only works when run from inside the NetBeans IDE. + + + + + + + + + This target only works when run from inside the NetBeans IDE. + + + + + + + + + + + + + This target only works when run from inside the NetBeans IDE. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select one file in the IDE or set run.class + + + + + + Must select some files in the IDE or set test.includes + + + + + Must select one file in the IDE or set run.class + + + + + Must select one file in the IDE or set applet.url + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must select some files in the IDE or set javac.includes + + + + + + + + + + + + + + + + + + + + Some tests failed; see details above. + + + + + + + + + Must select some files in the IDE or set test.includes + + + + Some tests failed; see details above. + + + + Must select some files in the IDE or set test.class + Must select some method in the IDE or set test.method + + + + Some tests failed; see details above. 
+ + + + + Must select one file in the IDE or set test.class + + + + Must select one file in the IDE or set test.class + Must select some method in the IDE or set test.method + + + + + + + + + + + + + + Must select one file in the IDE or set applet.url + + + + + + + + + Must select one file in the IDE or set applet.url + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/nbproject/genfiles.properties b/nbproject/genfiles.properties new file mode 100644 index 0000000..15fcb32 --- /dev/null +++ b/nbproject/genfiles.properties @@ -0,0 +1,8 @@ +build.xml.data.CRC32=b6a6e4b5 +build.xml.script.CRC32=6d71407f +build.xml.stylesheet.CRC32=28e38971@1.44.1.45 +# This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. +# Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. +nbproject/build-impl.xml.data.CRC32=b6a6e4b5 +nbproject/build-impl.xml.script.CRC32=0e5d9718 +nbproject/build-impl.xml.stylesheet.CRC32=c6d2a60f@1.56.0.46 diff --git a/nbproject/private/private.properties b/nbproject/private/private.properties new file mode 100644 index 0000000..b451a9b --- /dev/null +++ b/nbproject/private/private.properties @@ -0,0 +1,2 @@ +compile.on.save=true +user.properties.file=/Users/dashazhernakova/Library/Application Support/NetBeans/7.2.1/build.properties diff --git a/nbproject/private/private.xml b/nbproject/private/private.xml new file mode 100644 index 0000000..8505fc1 --- /dev/null +++ b/nbproject/private/private.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/nbproject/private/profiler/configurations.xml b/nbproject/private/profiler/configurations.xml new file mode 100644 index 0000000..b01b9a1 --- /dev/null +++ b/nbproject/private/profiler/configurations.xml @@ -0,0 +1,110 @@ + + + +1000 +false +profiler.simple.filter +false + +8 +true + +false +0 +false +true +1 +false +false 
+false +profiler.simple.filter +32 +false +1 +true +3 +10 +1 +true +Analyze Memory +false +1 +true +10 +0 +profiler.simple.filter +0 +false +true + + +1 +true + + +false +false +true +false +false +32 +Quick filter... +0 +false +0 +{$project.classes.only} +10 +0 +true +true + +true +10 + +1000 +0 +profiler.simple.filter +false +Analyze Performance + +1 +0 + +0 +false +profiler.simple.filter +Quick filter... +true +false +0 +true + +2 +32 + +0 +false +Profile only project classes +0 +0 +profiler.simple.filter +true +1 +false +10 +false +10 +false +true +true +false +Quick filter... +0 +false + +2 +Monitor Application +1000 +true +true + diff --git a/nbproject/project.properties b/nbproject/project.properties new file mode 100644 index 0000000..383121d --- /dev/null +++ b/nbproject/project.properties @@ -0,0 +1,88 @@ +annotation.processing.enabled=true +annotation.processing.enabled.in.editor=false +annotation.processing.processor.options= +annotation.processing.processors.list= +annotation.processing.run.all.processors=true +annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output +build.classes.dir=${build.dir}/classes +build.classes.excludes=**/*.java,**/*.form +# This directory is removed when the project is cleaned: +build.dir=build +build.generated.dir=${build.dir}/generated +build.generated.sources.dir=${build.dir}/generated-sources +# Only compile against the classpath explicitly listed here: +build.sysclasspath=ignore +build.test.classes.dir=${build.dir}/test/classes +build.test.results.dir=${build.dir}/test/results +# Uncomment to specify the preferred debugger connection transport: +#debug.transport=dt_socket +debug.classpath=\ + ${run.classpath} +debug.test.classpath=\ + ${run.test.classpath} +# This directory is removed when the project is cleaned: +dist.dir=dist +dist.jar=${dist.dir}/ExpressionTable.jar +dist.javadoc.dir=${dist.dir}/javadoc +excludes= +file.reference.colt.jar=/Users/dashazhernakova/lib/colt.jar 
+file.reference.commons-math-2.1.jar=/Users/dashazhernakova/lib/commons-math-2.1.jar +file.reference.jsc.jar=/Users/dashazhernakova/lib/jsc.jar +file.reference.jscicore.jar=/Users/dashazhernakova/lib/jscicore.jar +includes=** +jar.compress=false +javac.classpath=\ + ${reference.GeneticaLibraries.jar}:\ + ${reference.Correlation.jar}:\ + ${reference.eQTLMappingPipeline.jar}:\ + ${file.reference.colt.jar}:\ + ${file.reference.jsc.jar}:\ + ${file.reference.jscicore.jar}:\ + ${file.reference.commons-math-2.1.jar} +# Space-separated list of extra javac options +javac.compilerargs= +javac.deprecation=false +javac.processorpath=\ + ${javac.classpath} +javac.source=1.6 +javac.target=1.6 +javac.test.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +javac.test.processorpath=\ + ${javac.test.classpath} +javadoc.additionalparam= +javadoc.author=false +javadoc.encoding=${source.encoding} +javadoc.noindex=false +javadoc.nonavbar=false +javadoc.notree=false +javadoc.private=false +javadoc.splitindex=true +javadoc.use=true +javadoc.version=false +javadoc.windowtitle= +main.class=expressiontable.ExpressionTable +manifest.file=manifest.mf +meta.inf.dir=${src.dir}/META-INF +mkdist.disabled=false +platform.active=default_platform +project.Correlation=../Correlation +project.eQTLMappingPipeline=../eqtlmappingpipeline +project.GeneticaLibraries=../GeneticaLibraries +reference.Correlation.jar=${project.Correlation}/dist/Correlation.jar +reference.eQTLMappingPipeline.jar=${project.eQTLMappingPipeline}/dist/eQTLMappingPipeline.jar +reference.GeneticaLibraries.jar=${project.GeneticaLibraries}/dist/GeneticaLibraries.jar +run.classpath=\ + ${javac.classpath}:\ + ${build.classes.dir} +# Space-separated list of JVM arguments used when running the project +# (you may also define separate properties like run-sys-prop.name=value instead of -Dname=value +# or test-sys-prop.name=value to set system properties for unit tests): +run.jvmargs= +run.test.classpath=\ + ${javac.test.classpath}:\ + 
${build.test.classes.dir} +source.encoding=UTF-8 +src.dir=src +test.src.dir=test diff --git a/nbproject/project.xml b/nbproject/project.xml new file mode 100644 index 0000000..fe206b8 --- /dev/null +++ b/nbproject/project.xml @@ -0,0 +1,41 @@ + + + org.netbeans.modules.java.j2seproject + + + ExpressionTable + + + + + + + + + + Correlation + jar + + jar + clean + jar + + + GeneticaLibraries + jar + + jar + clean + jar + + + eQTLMappingPipeline + jar + + jar + clean + jar + + + + diff --git a/src/expressiontable/Coexpression.java b/src/expressiontable/Coexpression.java new file mode 100644 index 0000000..2b99d82 --- /dev/null +++ b/src/expressiontable/Coexpression.java @@ -0,0 +1,43 @@ + +package expressiontable; + +import java.io.IOException; +import java.util.HashMap; +import org.apache.commons.math.stat.correlation.SpearmansCorrelation; +import umcg.genetica.io.ExpressionDataset; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.math.matrix.DoubleMatrixDataset; + +/** + * + * @author dashazhernakova + */ +public class Coexpression { + + public void calculateCoexpression(String fname, String out_fname) throws IOException{ + TextFile out = new TextFile(out_fname, true); + DoubleMatrixDataset dataset = new DoubleMatrixDataset(fname); + double[][] rawData = dataset.getRawData(); + dataset.recalculateHashMaps(); + HashMap hashProbes = new HashMap(dataset.hashRows);//probes to indices in rawData + double[] pr1_expr, pr2_expr; + double cor; + + String probe1 = "7_50472431"; + for (String probe2 : hashProbes.keySet()){ + if (! 
probe1.equals(probe2)){ + pr1_expr = rawData[hashProbes.get(probe1)]; + pr2_expr = rawData[hashProbes.get(probe2)]; + cor = new SpearmansCorrelation().correlation(pr1_expr, pr2_expr); + out.writeln(probe1 + "\t" + probe2 + "\t" + cor); + } + } + //} + out.close(); + } + public static void main(String[] args) throws IOException { + Coexpression c = new Coexpression(); + c.calculateCoexpression("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/tagwise_expression_table_SNP_in_recognition_sequence_tags_excluded.txt.QuantileNormalized.Log2Transformed.ProbesCentered.SamplesZTransformed.txt.gz", + "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/coexpression_noPCA_7_50472431"); + } +} diff --git a/src/expressiontable/ExpressionTable.java b/src/expressiontable/ExpressionTable.java new file mode 100644 index 0000000..8cc9f75 --- /dev/null +++ b/src/expressiontable/ExpressionTable.java @@ -0,0 +1,181 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +package expressiontable; + +import java.io.IOException; +//import eqtlmappingpipeline.normalization.Normalizer; +/** + * + * @author dashazhernakova + */ +public class ExpressionTable { + + /** + * @param args the command line arguments + */ + public static void usage(){ + System.out.println("--mode\n\t" + + "ProbeToGeneConverter\n\t" + + "getExpressedInAllSamples\n\t" + + "getTopExpressed\n\t" + + "sort\n\t" + + "normalize"); + } + public static void main(String[] args) throws IOException { + String lincRNA = "/Users/dashazhernakova/Documents/UMCG/lincRNA/annotation_lincRNA_hg19_toGenes.txt", + transcr = "/Users/dashazhernakova/Documents/UMCG/hg19/annotation_transcr_hg19.txt"; + Subtable sub = new Subtable(); + Sorter sorter = new Sorter(); + Normalizer norm = new Normalizer(); + //Normalizer norm = new Normalizer(); + + String arg, val, in = null, mode = null, out = null; + + int i = 0; + for (i = 0; i < args.length; i++) { + arg = args[i]; + val = null; + + if (i + 1 < args.length) { + val = args[i + 1]; + } + + if (arg.equals("--mode")) { + mode = val; + //System.out.println("mode"); + break; + } + + } + if (mode == null) { + System.out.println("ERROR: Please supply --mode"); + usage(); + } + else if (mode.equals("ProbeToGeneConverter")){ + String annot = null; + boolean unique = false; + for (int j = i; j < args.length; j++){ + arg = args[j]; + val = null; + + if (j + 1 < args.length) { + val = args[j + 1]; + } + if (arg.equals("--in")) + in = val; + if (arg.equals("--annot")) { + annot = val; + } + if (arg.equals("--out")) + out = val; + if (arg.equals("--unique")) + unique = Boolean.valueOf(val); + } + if (out == null) + out = in.replaceAll("(\\.gz)?(\\.txt)?$", "") + ".genes.txt"; + if ( (in == null ) || (annot == null )) + System.out.println("Not enough arguments!!!"); + System.out.println("Converting to gene ids: \n\texpression table " + in + "\n\tunique " + unique + "\n\tannotation " + annot); + ProbeToGeneConverter converter = new 
ProbeToGeneConverter(annot); + converter.convertProbesToGenesAvg(in, out, unique); + } + else if (mode.equals("getExpressedInAllSamples")){ + + for (int j = i; j < args.length; j++){ + arg = args[j]; + val = null; + + if (j + 1 < args.length) { + val = args[j + 1]; + } + if (arg.equals("--in")) + in = val; + if (arg.equals("--out")) + out = val; + + } + if (out == null) + out = in.replaceAll("(\\.gz)?(\\.txt)?$", "") + ".expressedInAllSamples.txt"; + System.out.println("\nGetting probes expressed in all samples from " + in); + sub.getExpressedInAllSamples(in, out); + } + else if (mode.equals("getTopExpressed")){ + int n = 0; + for (int j = i; j < args.length; j++){ + arg = args[j]; + val = null; + + if (j + 1 < args.length) { + val = args[j + 1]; + } + if (arg.equals("--in")) + in = val; + if (arg.equals("--out")) + out = val; + if (arg.equals("--n")) + n = Integer.parseInt(val); + } + if (out == null) + out = in.replaceAll("(\\.gz)?(\\.txt)?$", "") + ".top" + n; + System.out.println("Getting top " + n + " expressed genes/transcripts... 
from " + in); + sub.getMostExpressed(in, out, n); + } + + else if (mode.equals("sort")){ + String by = null; + for (int j = i; j < args.length; j++){ + arg = args[j]; + val = null; + + if (j + 1 < args.length) { + val = args[j + 1]; + } + if (arg.equals("--in")) + in = val; + if (arg.equals("--out")) + out = val; + if (arg.equals("--by")) + by = val; + } + if (out == null) + out = in + ".sorted"; + if (by.equals("name")){ + System.out.println("Sorting " + in + " by probe name..."); + if (out == null) + out = in.replaceAll("(\\.gz)?(\\.txt)?$", "") + ".sortedByName"; + sorter.sortByProbeName(in, out); + + } + else if (by.equals("expression")){ + System.out.println("Sorting " + in + " by average expression..."); + if (out == null) + out = in.replaceAll("(\\.gz)?(\\.txt)?$", "") + ".sortedByExpr"; + sorter.sortByAvgExpression(in, out); + } + else + System.out.println("Wrong \"by\" parameter"); + } + else if (mode.equals("normalize")){ + + for (int j = i; j < args.length; j++){ + arg = args[j]; + val = null; + + if (j + 1 < args.length) { + val = args[j + 1]; + } + if (arg.equals("--in")) + in = val; + } + System.out.println("Normalizing " + in); + norm.normalize(in); + } + else{ + System.out.println("Wrong mode!"); + usage(); + } + + } +} diff --git a/src/expressiontable/Joiner.java b/src/expressiontable/Joiner.java new file mode 100644 index 0000000..f6cd6cb --- /dev/null +++ b/src/expressiontable/Joiner.java @@ -0,0 +1,72 @@ +package expressiontable; + +import java.io.IOException; +import java.util.HashSet; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.math.matrix.DoubleMatrixDataset; + +/** + * + * @author dashazhernakova + */ +public class Joiner { + DoubleMatrixDataset table1; + DoubleMatrixDataset table2; + public Joiner(String f1, String f2) throws IOException{ + table1 = new DoubleMatrixDataset(f1); + table2 = new DoubleMatrixDataset(f2); + } + public void addNewProbes(String outFileName) throws IOException{ + TextFile out = new 
TextFile(outFileName, true); + HashSet newProbes = new HashSet(); + + //looking for probes from table2 not present in table1 + for (String probe : table2.rowObjects){ + if (! table1.rowObjects.contains(probe)) + newProbes.add(probe); + } + out.close(); + } + + public void appendSamples(String outFileName) throws IOException{ + TextFile out = new TextFile(outFileName, true); + int lineN1 = 0, lineN2 = 0; + //header + for (String id : table1.colObjects) + out.write("\t" + id); + for (String id : table2.colObjects) + out.write("\t" + id); + out.writeln(); + //probes+expression + for (String probe : table1.rowObjects){ + if (table2.rowObjects.contains(probe)){ + out.write(probe); + lineN1 = table1.hashRows.get(probe); + lineN2 = table2.hashRows.get(probe); + for (int i = 0; i < table1.nrCols;i++){ + out.write("\t" + table1.rawData[lineN1][i]); + } + for (int i = 0; i < table2.nrCols;i++){ + out.write("\t" + table2.rawData[lineN2][i]); + } + out.writeln(); + } + + } + + out.close(); + } + + public void merge(){ + + } + public static void main(String[] args) throws IOException { + /*Joiner j = new Joiner("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/lincRNA_Montgomery/expression_table_all.txt.expressedInAllSamples.txt.200genes.sortedByName.txt.QuantileNormalized.Log2Transformed.txt", + "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/lincRNA_Yale+Argonne/expression_table_all_yale+argonne.txt.expressedInAllSamples.txt.genes.txt.QuantileNormalized.Log2Transformed.txt"); + + j.appendSamples("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/lincRNA_Montgomery/Montgomery+Pickrell.genes.txt.QuantileNormalized.Log2Transformed.txt"); + * + */ + System.out.println("tfd/sdfgs/sdfgs.txt/fsdf.gz".replaceAll("(\\.gz)?(\\.txt)?$", "")); + } +} diff --git a/src/expressiontable/Normalizer.java b/src/expressiontable/Normalizer.java new file mode 100644 index 0000000..120b01a --- /dev/null +++ b/src/expressiontable/Normalizer.java @@ -0,0 
+1,85 @@ +package expressiontable; + +import java.io.IOException; +import umcg.genetica.io.ExpressionDataset; +import umcg.genetica.math.matrix.DoubleMatrixDataset; +import umcg.genetica.math.stats.Descriptives; +import umcg.genetica.math.stats.Log2Transform; +import umcg.genetica.math.stats.QuantileNormalization; + +/** + * + * @author dashazhernakova + */ +public class Normalizer { + public void normalize(String expressionFile) throws IOException{ + DoubleMatrixDataset dataset = new DoubleMatrixDataset(expressionFile); + double[][] rawData = dataset.getRawData(); + String fileNamePrefix = expressionFile; + + + QuantileNormalization.quantilenormalize(rawData); +// + DoubleMatrixDataset datasetNormalized = new DoubleMatrixDataset (dataset.nrRows, dataset.nrCols); + + datasetNormalized.rowObjects = dataset.rowObjects; + datasetNormalized.colObjects = dataset.colObjects; + datasetNormalized.setRawData(rawData); + fileNamePrefix += ".QuantileNormalized"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + datasetNormalized = null; + + + Log2Transform.log2transform(rawData); + + datasetNormalized = new DoubleMatrixDataset(dataset.nrRows, dataset.nrCols); + datasetNormalized.rowObjects = dataset.rowObjects; + datasetNormalized.colObjects = dataset.colObjects; + datasetNormalized.setRawData(rawData); + fileNamePrefix += ".Log2Transformed"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + datasetNormalized = null; + + System.out.println("Standardizing probe mean and standard deviation"); + for (int p = 0; p < dataset.nrRows; p++) { + double mean = Descriptives.mean(rawData[p]); + double stdev = Math.sqrt(Descriptives.variance(rawData[p], mean)); + for (int s = 0; s < dataset.nrCols; s++) { + rawData[p][s] -= mean; + } + } + + dataset.setRawData(rawData); + fileNamePrefix += ".ProbesCentered"; + dataset.save(fileNamePrefix + ".txt.gz"); + + System.out.println("- Standardizing sample mean and standard deviation"); + for (int s = 0; s < dataset.nrCols; s++) { + 
double[] vals = new double[dataset.nrRows]; + for (int p = 0; p < dataset.nrRows; p++) { + vals[p] = dataset.getRawData()[p][s]; + } + double mean = Descriptives.mean(vals); + for (int p = 0; p < dataset.nrRows; p++) { + vals[p] -= mean; + } + double var = Descriptives.variance(vals, mean); + double stdev = Math.sqrt(var); + for (int p = 0; p < dataset.nrRows; p++) { + dataset.getRawData()[p][s] = (vals[p] / stdev); + } + } + + datasetNormalized = new DoubleMatrixDataset(dataset.nrRows, dataset.nrCols); + datasetNormalized.rowObjects = dataset.rowObjects; + datasetNormalized.colObjects = dataset.colObjects; + datasetNormalized.setRawData(rawData); + fileNamePrefix += ".SamplesZTransformed"; + datasetNormalized.save(fileNamePrefix + ".txt.gz"); + datasetNormalized = null; + } + public static void main(String[] args) throws IOException { + Normalizer n = new Normalizer(); + n.normalize("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/lincRNA_Yale+Argonne/expression_table_all_yale+argonne.txt.expressedInAllSamples.txt.genes.txt"); + } +} diff --git a/src/expressiontable/ProbeToGeneConverter.java b/src/expressiontable/ProbeToGeneConverter.java new file mode 100644 index 0000000..13980aa --- /dev/null +++ b/src/expressiontable/ProbeToGeneConverter.java @@ -0,0 +1,144 @@ +package expressiontable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.TreeMap; +import umcg.genetica.io.text.TextFile; + +/** + * + * @author dashazhernakova + */ +public class ProbeToGeneConverter { + HashMap probe2genes; + public ProbeToGeneConverter(String annotationFile) throws IOException{ + TextFile annotation = new TextFile(annotationFile, false); + probe2genes = new HashMap(); + String[] els = annotation.readLineElems(TextFile.tab); + while ((els = annotation.readLineElems(TextFile.tab)) != null) + probe2genes.put(els[1], els[2]); + annotation.close(); + + } + + public 
ProbeToGeneConverter(){} + + /* + * Converts one gene ids X to gene ids Y + * fname - path to the expression table + * conversionFname - path to the file of the type X \t Y + */ + public void convertGeneIdsToGeneNames(String fname, String outFname, String conversionFname) throws IOException{ + HashMap conversion = new HashMap(); + TextFile conv = new TextFile(conversionFname, false); + String [] els; + while ((els = conv.readLineElems(TextFile.tab)) != null){ + conversion.put(els[0], els[1]); + } + conv.close(); + + TextFile table = new TextFile(fname, false); + TextFile out = new TextFile(outFname, true); + out.writeln(table.readLine()); + String line; + int pos = 0, neg = 0; + while ((line = table.readLine()) != null){ + els = line.split("\t"); + if (conversion.containsKey(els[0])){ + els[0] = conversion.get(els[0]); + out.writelnTabDelimited(els); + pos++; + } + else + neg++; + } + System.out.println("Successfully converted " + pos + " genes\nNo alternative name found for " + neg + " genes."); + table.close(); + out.close(); + } + + /* + * writes expression values averaged over all isoforms of a gene + * fname - expression table + * unique - write only genes with one isoform + */ + public void convertProbesToGenesAvg(String fname, String outFname, boolean unique) throws IOException{ + TextFile expr = new TextFile(fname, false); + + TextFile out = new TextFile(outFname, true); + String[] spl; + String line= expr.readLine(), probe = null, gene = null; + out.writeln(line); + TreeMap> gene2lines = new TreeMap>(); + TreeMap gene2avg = new TreeMap(); + System.out.println("Converting only single isoform genes? 
" + unique); + int numProbes = 0; + while ((line = expr.readLine()) != null){ + spl = line.split("\t"); + probe = spl[0]; + numProbes ++; + if (probe2genes.containsKey(probe)){ + gene = probe2genes.get(probe); + + ArrayList lines = new ArrayList(); + if (gene2lines.containsKey(gene)) { + gene2lines.get(gene).add(line); + } + else{ + lines.add(line); + gene2lines.put(gene, lines); + } + } + } + System.out.println("Overall number of probes processed: " + numProbes); + System.out.println("Overall number of resulting genes: " + gene2lines.keySet().size()); + + //Averaging and writing to file + int size; + String[] splLine; + float[] sum; + String avg; // average gene expression for each sample + for (Entry > e : gene2lines.entrySet()){ + ArrayList lines = e.getValue(); + gene = e.getKey(); + size = lines.size(); + sum = new float[lines.get(0).split("\t").length]; + avg = ""; + if ((size > 1) && (! unique)){ //if more than one isoform for this gene + for (String s : lines){ + splLine = s.split("\t"); + for (int i = 1; i < splLine.length; i++) + sum[i]+=Float.parseFloat(splLine[i]); //summing + } + out.write(gene); + for (int i = 1; i < sum.length; i++){ + out.write("\t" + sum[i]/size); //averaging over isoform expression values for current sample + } + out.writeln(); + //gene2avg.put(gene, avg); + } + else if (size == 1){ //if one isoform + out.write(gene);//average gene expression = isoform expression + splLine = lines.get(0).split("\t"); + for (int i = 1; i < splLine.length; i++) + out.write("\t" + splLine[i]); + out.writeln(); + } + } + expr.close(); + out.close(); + } + public static void main(String[] args) throws IOException { + /*ProbeToGeneConverter c = new ProbeToGeneConverter(); + + c.convertGeneIdsToGeneNames("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/Pickrell/genes/expression_table.Pickrell.genes.txt.gz.QuantileNormalized.Log2Transformed.txt", + 
"/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/Pickrell/genes/expression_table.Pickrell.geneNames.txt.gz.QuantileNormalized.Log2Transformed.txt", + "/Users/dashazhernakova/Documents/UMCG/hg19/Ids_conversion/Ensembl_v69_geneId2gene.txt"); + */ + ProbeToGeneConverter c = new ProbeToGeneConverter("/Users/dashazhernakova/Documents/UMCG/hg19/annotations/annotation_tag_hg19.txt"); + c.convertProbesToGenesAvg("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/randomSubsets/45samples/expression_table.deepSAGE_tag.45samples.1.txt.gz.QuantileNormalized.Log2Transformed.txt.gz", + "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/randomSubsets/45samples/expression_table.deepSAGE_tag.45samples.1.txt.gz.QuantileNormalized.Log2Transformed_genes.txt.gz", false); + } +} diff --git a/src/expressiontable/Sorter.java b/src/expressiontable/Sorter.java new file mode 100644 index 0000000..3590d02 --- /dev/null +++ b/src/expressiontable/Sorter.java @@ -0,0 +1,100 @@ +package expressiontable; + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.TreeMap; +import umcg.genetica.io.text.TextFile; +import umcg.genetica.math.matrix.DoubleMatrixDataset; + +/** + * + * @author dashazhernakova + */ +public class Sorter { + public double calculateAvg(double[] array){ + double avg = 0; + for (int i = 0 ; i < array.length; i++) + avg += array[i]; + avg /= array.length; + return avg; + } + + public void sortByAvgExpression(String fname, String outFname) throws IOException{ + TextFile out = new TextFile(outFname, true); + + //reading the expression table + DoubleMatrixDataset dataset = new DoubleMatrixDataset(fname); + dataset.recalculateHashMaps(); + HashMap hashProbes = new HashMap(dataset.hashRows);//probes to indices in rawData + + out.write("\t"); + out.writelnTabDelimited(dataset.colObjects.toArray()); + + 
double[] line = null; + int lineNum = 0; + HashMap probeNumToAvg = new HashMap(); //probe indices in rawData to avg expresion + ValueComparator bvc = new ValueComparator(probeNumToAvg); //to sort by value (avg expression) rather than by key + TreeMap sorted_probeNumToAvg = new TreeMap(bvc); //probeNumToAvg sorted by avg expression + + for ( Entry e : hashProbes.entrySet()){ + lineNum = e.getValue(); //probe index in rawData + line = dataset.getRawData()[lineNum]; //probe expression + probeNumToAvg.put(lineNum, calculateAvg(line)); + } + sorted_probeNumToAvg.putAll(probeNumToAvg); + for (Entry e : sorted_probeNumToAvg.entrySet()){ + lineNum = e.getKey(); + line = dataset.getRawData()[lineNum]; + out.write(dataset.rowObjects.get(lineNum)); + for (int i = 0; i < line.length; i++) + out.write("\t" + line[i]); + out.writeln(); + } + out.close(); + } + + public void sortByProbeName(String fname, String outFname) throws IOException{ + TextFile in = new TextFile(fname, false); + TextFile out = new TextFile(outFname, true); + + String line = in.readLine(), probe; + out.write("\t"); + out.writeln(line); + + TreeMap probe2expr = new TreeMap(); + + while ( (line = in.readLine()) != null){ + probe = line.split("\t")[0]; + probe2expr.put(probe, line); + } + + for (String pr : probe2expr.keySet()){ + out.writeln(pr + "\t" + probe2expr.get(pr)); + } + in.close(); + out.close(); + } + + public class ValueComparator implements Comparator { + + Map base; + public ValueComparator(Map base) { + this.base = base; + } + + + @Override + public int compare(Integer a, Integer b) { + return base.get(b).compareTo(base.get(a)); + } + } + public static void main(String[] args) throws IOException { + Sorter s = new Sorter(); + s.sortByAvgExpression("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/lincRNA_Sebo/expression_table_normByGeneLength.txt", + "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/lincRNA_Sebo/expression_table_normByGeneLength_sorted.txt"); + } 
}

diff --git a/src/expressiontable/Subtable.java b/src/expressiontable/Subtable.java
new file mode 100644
index 0000000..eeec7ff
--- /dev/null
+++ b/src/expressiontable/Subtable.java
@@ -0,0 +1,224 @@
package expressiontable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import umcg.genetica.io.ExpressionDataset;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.math.matrix.DoubleMatrixDataset;


/**
 * Extracts subsets of an expression table: random sample subsets, the most
 * expressed probes, and probes expressed in all or in a share of the samples.
 *
 * @author dashazhernakova
 */
public class Subtable {

    /**
     * Returns the arithmetic mean of the values.
     *
     * @param array values to average
     * @return sum of the values divided by their count (NaN for an empty array)
     */
    public double calculateAvg(double[] array){
        double avg = 0;
        for (double value : array){
            avg += value;
        }
        avg /= array.length;
        return avg;
    }

    /**
     * Tells whether no value in the array is exactly zero.
     * <p>
     * NOTE(review): this treats negative values as expressed, whereas
     * {@link #isExpressedInNSamples(double[], int)} counts only values above
     * zero - confirm which is intended for transformed data.
     *
     * @param array expression values of one probe
     * @return false as soon as a value equal to 0 is found, true otherwise
     */
    public boolean isExpressedInAllSamples(double[] array){
        for (double value : array){
            if (value == 0){
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether at least N values in the array are greater than zero.
     *
     * @param array expression values of one probe
     * @param N minimum number of values above zero
     * @return true if the number of values above zero is at least N
     */
    public boolean isExpressedInNSamples(double[] array, int N){
        int n = 0;
        for (double value : array){
            if (value > 0){
                n++;
            }
        }
        return n >= N;
    }

    /**
     * Tells whether the average of the values is not below the threshold.
     *
     * @param array expression values of one probe
     * @param threshold lowest average that still passes
     * @return false if the average is lower than the threshold, true otherwise
     *         (including when the average is NaN, e.g. for an empty array)
     */
    public boolean avgExpressionHigherThanThreshold(double[] array, double threshold){
        // Written as a negated "<" so that a NaN average still yields true.
        return !(calculateAvg(array) < threshold);
    }

    /**
     * Saves a copy of the expression table restricted to n randomly chosen
     * samples (columns). The chosen sample names are printed to standard output.
     *
     * @param fname path to the expression table
     * @param n number of samples to keep
     * @param outFname path of the reduced table to write
     * @throws IOException if the table cannot be read or the output cannot be written
     * @throws IllegalArgumentException if n is negative or larger than the
     *         number of samples in the table
     */
    public void getRandomSubsetOfSamples(String fname, int n, String outFname) throws IOException{
        //reading the expression table
        DoubleMatrixDataset<String, String> dataset = new DoubleMatrixDataset<String, String>(fname);
        dataset.recalculateHashMaps();

        // Shuffle a copy so the column list of the loaded dataset is left untouched.
        List<String> samples = new ArrayList<String>(dataset.colObjects);
        if (n < 0 || n > samples.size()){
            throw new IllegalArgumentException("Cannot select " + n + " samples from a table with " + samples.size() + " samples");
        }
        Collections.shuffle(samples);

        HashSet<String> samplesToInclude = new HashSet<String>(n);
        samplesToInclude.addAll(samples.subList(0, n));
        for (String s : samplesToInclude){
            System.out.println(s);
        }

        // Reload the table with all rows but only the selected columns.
        dataset = new DoubleMatrixDataset<String, String> (fname, new HashSet<String>(dataset.rowObjects), samplesToInclude);
        dataset.recalculateHashMaps();
        dataset.save(outFname);
    }

    /**
     * Writes one line per probe containing the probe name and its average
     * expression, separated by a tab.
     *
     * @param fname path to the expression table
     * @param outFname path of the file to write
     * @throws IOException if the table cannot be read or the output cannot be written
     */
    public void getAvgExpression(String fname, String outFname) throws IOException{
        TextFile out = new TextFile(outFname, true);
        try {
            //reading the expression table
            DoubleMatrixDataset<String, String> dataset = new DoubleMatrixDataset<String, String>(fname);
            dataset.recalculateHashMaps();
            HashMap<String, Integer> hashProbes = new HashMap<String, Integer>(dataset.hashRows);//probes to indices in rawData

            //out.writeln("gene\tavg");
            for (Entry<String, Integer> e : hashProbes.entrySet()){
                int lineNum = e.getValue(); //probe index in rawData
                String probe = dataset.rowObjects.get(lineNum);
                double[] line = dataset.getRawData()[lineNum]; //probe expression
                out.writeln(probe + "\t" + calculateAvg(line));
            }
        } finally {
            out.close();
        }
    }

    /**
     * Gets top N most expressed probes (N specified by numProbes), ranked by
     * average expression, and writes them from highest to lowest average.
     * <p>
     * Probes with equal averages are ranked by their row index. If the table
     * has fewer than numProbes probes, all of them are written.
     *
     * @param fname path to the expression table
     * @param outFname path of the table to write
     * @param numProbes number of probes to keep
     * @throws IOException if the table cannot be read or the output cannot be written
     */
    public void getMostExpressed(String fname, String outFname, int numProbes) throws IOException{
        TextFile out = new TextFile(outFname, true);
        try {
            //reading the expression table
            DoubleMatrixDataset<String, String> dataset = new DoubleMatrixDataset<String, String>(fname);
            dataset.recalculateHashMaps();
            HashMap<String, Integer> hashProbes = new HashMap<String, Integer>(dataset.hashRows);//probes to indices in rawData

            out.write("\t");
            out.writelnTabDelimited(dataset.colObjects.toArray());

            final HashMap<Integer, Double> probeNumToAvg = new HashMap<Integer, Double>(); //probe indices in rawData to avg expression
            for (Entry<String, Integer> e : hashProbes.entrySet()){
                int lineNum = e.getValue(); //probe index in rawData
                probeNumToAvg.put(lineNum, calculateAvg(dataset.getRawData()[lineNum]));
            }

            // Rank the row indices by descending average; the index breaks ties so
            // that rows with equal averages are all kept in a stable order.
            List<Integer> ranked = new ArrayList<Integer>(probeNumToAvg.keySet());
            Collections.sort(ranked, new Comparator<Integer>() {
                @Override
                public int compare(Integer a, Integer b) {
                    int byAvg = probeNumToAvg.get(b).compareTo(probeNumToAvg.get(a));
                    if (byAvg != 0){
                        return byAvg;
                    }
                    return a.compareTo(b);
                }
            });

            // The limit applies to the rank, not to the row index.
            int limit = Math.min(numProbes, ranked.size());
            for (int rank = 0; rank < limit; rank++){
                int lineNum = ranked.get(rank);
                double[] line = dataset.getRawData()[lineNum];
                out.write(dataset.rowObjects.get(lineNum));
                for (int i = 0; i < line.length; i++){
                    out.write("\t" + line[i]);
                }
                out.writeln();
            }
        } finally {
            out.close();
        }
    }


    /**
     * Writes all probes that have no zero value in any sample and prints how
     * many were written.
     *
     * @param fname path to the expression table
     * @param outFname path of the table to write
     * @throws IOException if the table cannot be read or the output cannot be written
     */
    public void getExpressedInAllSamples(String fname, String outFname) throws IOException{
        TextFile out = new TextFile(outFname, true);
        try {
            DoubleMatrixDataset<String, String> dataset = new DoubleMatrixDataset<String, String>(fname);
            dataset.recalculateHashMaps();
            HashMap<String, Integer> hashProbes = new HashMap<String, Integer>(dataset.hashRows);//probes to indices in rawData

            out.write("\t");
            out.writelnTabDelimited(dataset.colObjects.toArray());

            int counter = 0;
            for (Entry<String, Integer> e : hashProbes.entrySet()){
                double[] line = dataset.getRawData()[e.getValue()];
                if (isExpressedInAllSamples(line)){
                    counter ++;
                    out.write(e.getKey());
                    for (int i = 0; i < line.length; i++){
                        out.write("\t" + line[i]);
                    }
                    out.writeln();
                }
            }
            System.out.println("Number of probes expressed in all samples: " + counter);
        } finally {
            out.close();
        }
    }

    /**
     * Writes all probes expressed (value above zero) in at least "percent" %
     * of the samples and prints how many were written.
     * <p>
     * The required number of samples is rounded up, so a probe never passes
     * with a smaller share of samples than requested.
     *
     * @param fname path to the expression table
     * @param outFname path of the table to write
     * @param percent minimum percentage of samples in which a probe must be expressed
     * @throws IOException if the table cannot be read or the output cannot be written
     */
    public void getExpressedInNSamples(String fname, String outFname, int percent) throws IOException{
        TextFile out = new TextFile(outFname, true);
        try {
            DoubleMatrixDataset<String, String> dataset = new DoubleMatrixDataset<String, String>(fname);
            dataset.recalculateHashMaps();
            HashMap<String, Integer> hashProbes = new HashMap<String, Integer>(dataset.hashRows);//probes to indices in rawData

            out.write("\t");
            out.writelnTabDelimited(dataset.colObjects.toArray());

            // Integer division would round down and let through probes below the
            // requested share (e.g. 22 of 45 samples for 50 %).
            int minSamplesExpressed = (int) Math.ceil(dataset.nrCols * percent / 100.0);
            int counter = 0;
            for (Entry<String, Integer> e : hashProbes.entrySet()){
                double[] line = dataset.getRawData()[e.getValue()];
                if (isExpressedInNSamples(line, minSamplesExpressed)){
                    counter ++;
                    out.write(e.getKey());
                    for (int i = 0; i < line.length; i++){
                        out.write("\t" + line[i]);
                    }
                    out.writeln();
                }
            }
            System.out.println("Number of probes expressed in " + percent + " % of samples: " + counter);
        } finally {
            out.close();
        }
    }

    /**
     * Draws a random subset of 40 samples from a hard-coded local table.
     *
     * @param args ignored
     * @throws IOException if the hard-coded files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        Subtable c = new Subtable();
        /*c.getExpressedInAllSamples("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/tmp.txt",
                "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/tmp2.txt");
        Sorter s = new Sorter();
        s.sortByAvgExpression("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/tmp.txt",
                "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/tmp2.txt");
         *
         */
        //c.getExpressedInAllSamples("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/Yale+Argonne/yale_argonne_expression_nonnorm_NONZERO.txt",
        //        "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/Yale+Argonne/yale_argonne_expression_nonnorm_NONZERO.txt.expressedInAllSamples");

        //c.getAvgExpression("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/tagwise_expression_table_SNP_in_recognition_sequence_tags_excluded.txt",
        //        "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/avgExpression.txt");
        c.getRandomSubsetOfSamples("/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/tagwise_expression_table_SNP_in_recognition_sequence_tags_excluded.txt",
                40,
                "/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/tmp.txt");
                //"/Users/dashazhernakova/Documents/UMCG/GeneticalGenomicsDatasets/new/deepSAGE_tag/randomSubsets/55samples/expression_table.deepSAGE_tag.55samples.4.txt.gz");
    }


}