From d459b0afef936d557a1bbcd3bca6bde3d2b9cc39 Mon Sep 17 00:00:00 2001 From: Paul Pavlidis Date: Wed, 12 Apr 2023 12:50:00 -0700 Subject: [PATCH] add convenience methods for finding rows by name This is useful for debugging. I only added one test. Relevant to https://github.com/PavlidisLab/GemmaCuration/issues/218 so ExpressionDataDoubleMatrixUtil is touched --- .../matrix/BaseExpressionDataMatrix.java | 19 ++++++++++++++----- .../matrix/EmptyExpressionMatrix.java | 5 +++++ .../matrix/ExpressionDataBooleanMatrix.java | 12 ++++++++++++ .../matrix/ExpressionDataDoubleMatrix.java | 13 +++++++++++++ .../ExpressionDataDoubleMatrixUtil.java | 10 +++++++++- .../matrix/ExpressionDataIntegerMatrix.java | 13 +++++++++++++ .../matrix/ExpressionDataMatrix.java | 17 +++++++++++++++++ .../matrix/ExpressionDataStringMatrix.java | 14 ++++++++++++++ .../ExpressionDataDoubleMatrixTest.java | 2 ++ 9 files changed, 99 insertions(+), 6 deletions(-) diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/BaseExpressionDataMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/BaseExpressionDataMatrix.java index 053080ec94..39648a50b2 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/BaseExpressionDataMatrix.java +++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/BaseExpressionDataMatrix.java @@ -34,6 +34,7 @@ import javax.annotation.Nullable; import java.io.Serializable; import java.util.*; +import java.util.stream.Collectors; /** * Base class for ExpressionDataMatrix implementations. 
@@ -129,6 +130,14 @@ public BioAssayDimension getBestBioAssayDimension() { } + @Override + public Collection findRowsByName( String name ) { + return rowElementMap.entrySet().stream() + .filter( entry -> entry.getKey().getName().equals( name ) ) + .map( entry -> entry.getValue() ) + .collect( Collectors.toList() ); + } + @Override public BioAssayDimension getBioAssayDimension( CompositeSequence designElement ) { @@ -258,7 +267,7 @@ void addToRowMaps( Integer row, CompositeSequence designElement ) { * For example, in the following diagram "-" indicates a biomaterial, while "*" indicates a bioassay. Each row of * "*" indicates samples run on a different microarray design (a different bio assay material). In the examples we * assume there is just a single biomaterial dimension. - * + * *
      * ---------------
      * *****              -- only a few samples run on this platform
@@ -268,7 +277,7 @@ void addToRowMaps( Integer row, CompositeSequence designElement ) {
      * 

* A simpler case: *

- * + * *
      * ---------------
      * ***************
@@ -278,7 +287,7 @@ void addToRowMaps( Integer row, CompositeSequence designElement ) {
      * 

* A more typical and easy case (one microarray design used): *

- * + * *
      * ----------------
      * ****************
@@ -286,7 +295,7 @@ void addToRowMaps( Integer row, CompositeSequence designElement ) {
      * 

* If every sample was run on two different array designs: *

- * + * *
      * ----------------
      * ****************
@@ -294,7 +303,7 @@ void addToRowMaps( Integer row, CompositeSequence designElement ) {
      * 
*

* Every sample was run on a different array design: - * + * *

      * -----------------------
      * ******
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/EmptyExpressionMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/EmptyExpressionMatrix.java
index 610ea8a17f..b7f2da4aae 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/EmptyExpressionMatrix.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/EmptyExpressionMatrix.java
@@ -83,6 +83,11 @@ public Object[] getColumn( BioAssay bioAssay ) {
         throw new UnsupportedOperationException();
     }
 
+    @Override
+    public Object[][] getRows( Collection indices ) {
+        throw new UnsupportedOperationException(); // selecting rows by index is not supported by this implementation
+    }
+
     @Override
     public Object[] getColumn( Integer column ) {
         throw new UnsupportedOperationException();
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataBooleanMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataBooleanMatrix.java
index 6170f51b49..2f25dadad1 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataBooleanMatrix.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataBooleanMatrix.java
@@ -62,6 +62,18 @@ public ExpressionDataBooleanMatrix( Collection indices ) {
+        if ( indices == null || indices.isEmpty() ) {
+            return null;
+        }
+
+        return indices.stream()
+                .map( index -> getRow( index ) )
+                .toArray( Boolean[][]::new );
+
+    }
+
     @Override
     public int columns() {
         return matrix.columns();
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrix.java
index 4348066e17..c82b67d53f 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrix.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrix.java
@@ -356,6 +356,19 @@ public Double[] getRow( Integer index ) {
         return ArrayUtils.toObject( rawRow );
     }
 
+
+    @Override
+    public Double[][] getRows( Collection indices ) {
+        // Contract: a null or empty selection yields null, never an empty array.
+        boolean nothingRequested = ( indices == null ) || indices.isEmpty();
+        if ( nothingRequested ) {
+            return null;
+        }
+        return indices.stream()
+                .map( this::getRow )
+                .toArray( Double[][]::new );
+    }
+
     @Override
     public Double[][] getRows( List designElements ) {
         if ( designElements == null ) {
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixUtil.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixUtil.java
index 15c84e18c3..f56cf31923 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixUtil.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixUtil.java
@@ -94,8 +94,16 @@ public static ExpressionDataDoubleMatrix filterAndLog2Transform( ExpressionDataD
             if ( dmatrix.rows() < r ) {
                 ExpressionDataDoubleMatrixUtil.log.info( ( r - dmatrix.rows() ) + " rows removed due to too many identical values" );
             }
-        }
 
+            /*
+             * As noted in https://github.com/PavlidisLab/GemmaCuration/issues/218, this filter can still do things
+             * that are counter-intuitive because we are using quantile-normalization. In some cases very highly expressed genes
+             * will be assigned multiple identical values, though they weren't identical in the raw data.
+             *
+             * Fixing this "once and for all" might mean making the filter less stringent at very high expression levels (like the top 1% of genes)
+             * or changing the approach.
+             */
+        }
         return dmatrix;
 
     }
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataIntegerMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataIntegerMatrix.java
index 6386accfb6..43e44d4f06 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataIntegerMatrix.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataIntegerMatrix.java
@@ -62,6 +62,19 @@ public Integer[][] get( List designElements, List b
         throw new UnsupportedOperationException();
     }
 
+
+    @Override
+    public Integer[][] getRows( Collection indices ) {
+        // Contract: a null or empty selection yields null, never an empty array.
+        boolean nothingRequested = ( indices == null ) || indices.isEmpty();
+        if ( nothingRequested ) {
+            return null;
+        }
+        return indices.stream()
+                .map( this::getRow )
+                .toArray( Integer[][]::new );
+    }
+
     @Override
     public Integer[] getColumn( BioAssay bioAssay ) {
         int index = this.columnAssayMap.get( bioAssay );
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataMatrix.java
index 37495f18f1..93ff439dd1 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataMatrix.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataMatrix.java
@@ -220,6 +220,23 @@ public interface ExpressionDataMatrix {
      */
     T[][] getRows( List designElements );
 
+    /**
+     * Access a submatrix made up of the rows at the given indices.
+     * @param rowIndices indices of the rows to select
+     * @return T[][] holding the selected rows, or null if rowIndices is null or empty.
+     */
+    T[][] getRows( Collection rowIndices);
+
+
+    /**
+     * Convenience function to locate the row indices whose CompositeSequence.name is an exact match for the given name.
+     *
+     * @param name The CompositeSequence name to look for
+     * @return collection of the matching row indices (usually will just be one value)
+     */
+    Collection findRowsByName(String name);
+
+
     /**
      * @return true if any values are null or NaN (for Doubles); all other values are considered non-missing.
      */
diff --git a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataStringMatrix.java b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataStringMatrix.java
index 16725f5000..d5f1fe3008 100644
--- a/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataStringMatrix.java
+++ b/gemma-core/src/main/java/ubic/gemma/core/datastructure/matrix/ExpressionDataStringMatrix.java
@@ -87,6 +87,20 @@ public String[][] get( List designElements, List bi
         throw new UnsupportedOperationException();
     }
 
+
+    @Override
+    public String[][] getRows( Collection indices ) {
+        // Contract: a null or empty selection yields null, never an empty array.
+        boolean nothingRequested = ( indices == null ) || indices.isEmpty();
+        if ( nothingRequested ) {
+            return null;
+        }
+        return indices.stream()
+                .map( this::getRow )
+                .toArray( String[][]::new );
+    }
+
+
     @Override
     public String[] getColumn( BioAssay bioAssay ) {
         int index = this.columnAssayMap.get( bioAssay );
diff --git a/gemma-core/src/test/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixTest.java b/gemma-core/src/test/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixTest.java
index 0f1ab27fb1..f5639d304f 100644
--- a/gemma-core/src/test/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixTest.java
+++ b/gemma-core/src/test/java/ubic/gemma/core/datastructure/matrix/ExpressionDataDoubleMatrixTest.java
@@ -170,6 +170,8 @@ public void testConstructExpressionDataDoubleMatrix() {
             log.debug( aRow );
         }
 
+        assertEquals(2, expressionDataDoubleMatrix.getRows( Arrays.asList( new Integer[]{1,2} )).length );
+
         Double[][] dMatrix = expressionDataDoubleMatrix.getRawMatrix();
         assertEquals( dMatrix.length, 200 );
         assertEquals( dMatrix[0].length, 59 );