add Aparapi + add NetworkSpiderGPU

spbsu-ml-community · Feb 26, 2018 · 0881c1b · 0881c1b
1 parent 5af206d
commit 0881c1b
Show file tree

Hide file tree

Showing 16 changed files with 951 additions and 29 deletions.
diff --git a/cuda/pom.xml b/cuda/pom.xml
@@ -76,6 +76,11 @@
       <artifactId>xchart</artifactId>
       <version>2.5.1</version>
     </dependency>
+    <dependency>
+      <groupId>com.aparapi</groupId>
+      <artifactId>aparapi</artifactId>
+      <version>1.4.1</version>
+    </dependency>
   </dependencies>
 
   <build>

diff --git a/cuda/src/main/java/com/expleague/cuda/AparapiOperations.java b/cuda/src/main/java/com/expleague/cuda/AparapiOperations.java
@@ -0,0 +1,243 @@
+package com.expleague.cuda;
+
+import com.aparapi.Kernel;
+import com.aparapi.Range;
+import com.aparapi.device.Device;
+import com.aparapi.internal.kernel.KernelManager;
+import com.expleague.commons.math.vectors.Mx;
+import com.expleague.commons.math.vectors.impl.mx.VecBasedMx;
+import com.expleague.commons.math.vectors.impl.vectors.ArrayVec;
+
+/**
+ * Created by hrundelb on 20.11.17.
+ */
+public class AparapiOperations {
+
+  /**
+   * Computes the element-wise sum of two equally sized vectors with an Aparapi kernel.
+   *
+   * @param left  first operand
+   * @param right second operand; must have the same length as {@code left}
+   * @return a newly allocated array where {@code result[i] = left[i] + right[i]}
+   * @throws IllegalArgumentException if the operand lengths differ
+   */
+  public static float[] sum(final float[] left, final float[] right) {
+    // Explicit check instead of `assert`: assertions are usually disabled at runtime, and a
+    // length mismatch would let the kernel index past the end of the shorter array.
+    if (left.length != right.length) {
+      throw new IllegalArgumentException(
+          "Vector lengths differ: " + left.length + " != " + right.length);
+    }
+    final float[] result = new float[left.length];
+    if (result.length == 0) {
+      // Nothing to compute; avoid launching a kernel over an empty range.
+      return result;
+    }
+
+    Kernel kernel = new Kernel() {
+      @Override
+      public void run() {
+        int i = getGlobalId();
+        result[i] = left[i] + right[i];
+      }
+    };
+
+    try {
+      Range range = Range.create(result.length);
+      kernel.execute(range);
+    } finally {
+      // Release the resources associated with this kernel instance.
+      kernel.dispose();
+    }
+    return result;
+  }
+
+  /**
+   * Multiplies two square matrices stored as flat row-major arrays:
+   * {@code result[i * size + j] = sum over k of left[i * size + k] * right[k * size + j]}.
+   *
+   * @param left   left operand, at least {@code size * size} elements
+   * @param right  right operand, at least {@code size * size} elements
+   * @param result output array, at least {@code size * size} elements; overwritten
+   * @param size   number of rows (and columns) of each matrix
+   */
+  public static void multiplyTo(float[] left, float[] right, float[] result, int size) {
+
+    final int blockDim = 16;
+    // The global size must be a multiple of the work-group size in each dimension, so it is
+    // rounded up here; the kernel skips the padding work-items.
+    final int globalDim = (size + blockDim - 1) / blockDim * blockDim;
+
+    Kernel kernel = new Kernel() {
+
+      @Override
+      public void run() {
+        int i = getGlobalId(0);
+        int j = getGlobalId(1);
+
+        // Work-items in the padded area have no matrix cell to compute.
+        if (i < size && j < size) {
+          float value = 0;
+          for (int k = 0; k < size; k++) {
+            value += left[i * size + k] * right[k * size + j];
+          }
+          result[i * size + j] = value;
+        }
+      }
+    };
+
+    try {
+      Device device = KernelManager.instance().bestDevice();
+      System.out.println("Run on: " + device.getShortDescription());
+      Range range = device.createRange2D(globalDim, globalDim, blockDim, blockDim);
+      kernel.execute(range);
+    } finally {
+      // Release the resources associated with this kernel instance.
+      kernel.dispose();
+    }
+  }
+
+  /**
+   * Computes the matrix product {@code result = left * right} with an Aparapi kernel, one
+   * work-item per output cell.
+   *
+   * <p>All three matrices are cast to {@link VecBasedMx} backed by an {@link ArrayVec} and their
+   * backing arrays are read and written directly.
+   * NOTE(review): the indexing assumes each backing array holds the matrix row-major starting at
+   * index 0 - confirm for sub-matrix views.
+   *
+   * @param left   left operand, {@code rows x cols1}
+   * @param right  right operand, {@code cols1 x cols2}
+   * @param result output matrix, {@code rows x cols2}; overwritten
+   * @throws IllegalArgumentException if the operand or result dimensions do not match
+   * @throws ClassCastException       if a matrix is not an {@code ArrayVec}-backed {@code VecBasedMx}
+   */
+  public static void multiplyTo(Mx left, Mx right, Mx result) {
+    if (left.columns() != right.rows()) {
+      throw new IllegalArgumentException(
+          "Inner dimensions differ: " + left.columns() + " != " + right.rows());
+    }
+
+    final int rows = left.rows();
+    final int cols1 = left.columns();
+    final int cols2 = right.columns();
+
+    // Without this check a too-small result would be written out of bounds by the kernel.
+    if (result.rows() != rows || result.columns() != cols2) {
+      throw new IllegalArgumentException(
+          "Result must be " + rows + "x" + cols2
+              + " but is " + result.rows() + "x" + result.columns());
+    }
+
+    double[] arrayLeft = ((ArrayVec) ((VecBasedMx) left).vec).data.array;
+    double[] arrayRight = ((ArrayVec) ((VecBasedMx) right).vec).data.array;
+    double[] arrayResult = ((ArrayVec) ((VecBasedMx) result).vec).data.array;
+
+    Kernel kernel = new Kernel() {
+      @Override
+      public void run() {
+        // Decode the flat work-item id into (row, column) of the output cell.
+        int i = getGlobalId() / cols2;
+        int j = getGlobalId() % cols2;
+
+        double value = 0;
+        for (int k = 0; k < cols1; k++) {
+          value += arrayLeft[i * cols1 + k] * arrayRight[k * cols2 + j];
+        }
+        arrayResult[i * cols2 + j] = value;
+      }
+    };
+
+    try {
+      Device device = KernelManager.instance().bestDevice();
+      System.out.println("Run on: " + device.getShortDescription());
+      Range range = device.createRange(rows * cols2);
+      kernel.execute(range);
+    } finally {
+      // Release the resources associated with this kernel instance.
+      kernel.dispose();
+    }
+  }
+
+
+
+  /**
+   * Transposes a square matrix stored as a flat array:
+   * {@code result[y * size + x] = left[x * size + y]}.
+   *
+   * <p>Each work-item stages its element in a work-group local tile before writing it out.
+   *
+   * @param left   input matrix, at least {@code size * size} elements
+   * @param result output array, at least {@code size * size} elements; overwritten
+   * @param size   number of rows (and columns) of the matrix
+   */
+  public static void transpose(float[] left, float[] result, int size) {
+
+    final int blockDim = 32;
+    // The kernel already bounds-checks against `size`, so the global range can safely be padded
+    // up to the next multiple of the work-group size, as an explicit local size requires.
+    final int globalDim = (size + blockDim - 1) / blockDim * blockDim;
+
+    Kernel kernel = new Kernel() {
+
+      @Local
+      float[] temp = new float[blockDim * blockDim];
+
+      @Override
+      public void run() {
+        int xIndex = getGlobalId(0);
+        int yIndex = getGlobalId(1);
+
+        if (xIndex < size && yIndex < size) {
+          int inputIdx = xIndex * size + yIndex;
+          temp[getLocalId(0) * blockDim + getLocalId(1)] = left[inputIdx];
+        }
+
+        // Every work-item of the group must reach the barrier, hence it sits outside the guards.
+        localBarrier();
+
+        if (xIndex < size && yIndex < size) {
+          int outputIndex = yIndex * size + xIndex;
+          result[outputIndex] = temp[getLocalId(0) * blockDim + getLocalId(1)];
+        }
+      }
+    };
+
+    try {
+      Device device = KernelManager.instance().bestDevice();
+      System.out.println("Run on: " + device.getShortDescription());
+      Range range = device.createRange2D(globalDim, globalDim, blockDim, blockDim);
+      kernel.execute(range);
+    } finally {
+      // Release the resources associated with this kernel instance.
+      kernel.dispose();
+    }
+  }
+
+
+  /**
+   * For every row {@code i} in {@code [0, rows)} reads {@code rows - 1} values from
+   * {@code matrix}, exponentiates them, appends a trailing {@code 1} and normalises the
+   * {@code rows} values by their sum. The normalised row is written to
+   * {@code result[i * rows .. i * rows + rows - 1]}.
+   *
+   * @param matrix input, read as {@code rows} rows of {@code rows - 1} values each
+   * @param result output, written as {@code rows} rows of {@code rows} values each
+   * @param rows   number of rows processed; also the length of every output row
+   * @param passes number of times the kernel is executed over the range
+   */
+  public static void matrixExp(final float[] matrix, final float[] result, int rows, int passes) {
+
+    Kernel kernel = new Kernel() {
+      @Override
+      public void run() {
+        int i = getGlobalId();
+        float sum = 0;
+        for (int j = 0; j < rows - 1; j++) {
+          // Kernel-provided exp(), the same call matrixExpReduce uses inside its kernel.
+          float e = exp(matrix[i * (rows - 1) + j]);
+          sum += e;
+          result[i * rows + j] = e;
+        }
+        // The last column is the fixed term 1, which also contributes to the normaliser.
+        result[i * rows + rows - 1] = 1;
+        sum += 1;
+        for (int j = 0; j < rows; j++) {
+          result[i * rows + j] = result[i * rows + j] / sum;
+        }
+      }
+    };
+
+    try {
+      Device device = KernelManager.instance().bestDevice();
+      System.out.println("Run on: " + device.getShortDescription());
+      Range range = device.createRange(rows);
+      kernel.execute(range, passes);
+    } finally {
+      // Release the resources associated with this kernel instance.
+      kernel.dispose();
+    }
+  }
+
+
+  /**
+   * Sums {@code arguments} in work-groups of 32 elements using a local-memory tree reduction.
+   *
+   * <p>Every work-item stores its local-memory slot after the reduction, so
+   * {@code results[k * 32]} holds the sum of block {@code k}, while the remaining entries hold
+   * intermediate partial sums of their block.
+   *
+   * @param arguments values to reduce
+   * @return a new array of the same length as {@code arguments} laid out as described above
+   */
+  public static float[] vectorReduce(final float[] arguments) {
+
+    final int blockSize = 32;
+    final int n = arguments.length;
+    final float[] results = new float[n];
+    if (n == 0) {
+      // Nothing to reduce; avoid launching a kernel over an empty range.
+      return results;
+    }
+
+    Kernel kernel = new Kernel() {
+
+      @Local
+      float[] sdata = new float[blockSize];
+
+      @Override
+      public void run() {
+        final int tid = getLocalId();
+        final int i = getGlobalId();
+
+        // Work-items past the end of the input load the additive identity, so the reduction
+        // never reads uninitialised local memory.
+        if (i < n) {
+          sdata[tid] = arguments[i];
+        } else {
+          sdata[tid] = 0f;
+        }
+        // All loads must be complete before any work-item starts combining slots.
+        localBarrier();
+
+        for (int s = blockSize / 2; s > 0; s >>= 1) {
+          if (tid < s) {
+            sdata[tid] += sdata[tid + s];
+          }
+          localBarrier();
+        }
+
+        // The global range is rounded up to a multiple of blockSize, so the padding
+        // work-items must not write past the end of `results`.
+        if (i < n) {
+          results[i] = sdata[tid];
+        }
+      }
+    };
+
+    try {
+      Device device = KernelManager.instance().bestDevice();
+      System.out.println("Run on: " + device.getShortDescription());
+      final int globalWidth = (n + blockSize - 1) / blockSize * blockSize;
+      Range range = device.createRange(globalWidth, blockSize);
+      kernel.execute(range);
+    } finally {
+      // Release the resources associated with this kernel instance.
+      kernel.dispose();
+    }
+    return results;
+  }
+
+
+  public static void matrixExpReduce(final float[] matrix, final float[] result, int rows, int
+      passes) {
+
+    final int blockSize = rows;
+
+    Kernel kernel = new Kernel() {
+      @Local
+      float[] sdata = new float[blockSize];
+
+      @Local
+      float[] res = new float[blockSize];
+
+      @Override
+      public void run() {
+        final int i = getGlobalId();
+        final int tid = getLocalId();
+        final int blockId = getGroupId();
+
+        if (tid < rows - 1) {
+          sdata[tid] = exp(matrix[blockId * (rows - 1) + tid]);
+          res[tid] = sdata[tid];
+        } else {
+          sdata[tid] = 1;
+          res[tid] = 1;
+        }
+
+        localBarrier();
+
+        for (int s = blockSize / 2; s > 0; s >>= 1) {
+          if (tid < s) {
+            sdata[tid] += sdata[tid + s];
+          }
+          localBarrier();
+        }
+
+        result[i] = res[tid] / sdata[0];
+      }
+    };
+
+    Device device = KernelManager.instance().bestDevice();
+    System.out.println("Run on: " + device.getShortDescription());
+    Range range = device.createRange(rows * rows, blockSize);
+    kernel.execute(range, passes);
+  }
+
+}
diff --git a/cuda/src/main/java/com/expleague/cuda/JCudaMemory.java b/cuda/src/main/java/com/expleague/cuda/JCudaMemory.java
@@ -62,6 +62,10 @@ public static CUdeviceptr copy(final double[] data, final @NotNull CUdeviceptr d
     return copy(Pointer.to(data), devicePointer, data.length * Sizeof.DOUBLE);
   }
 
+  /**
+   * Copies an {@code int} array to the given device pointer by delegating to the
+   * pointer-based {@code copy} overload with a byte count of
+   * {@code data.length * Sizeof.INT}.
+   *
+   * @param data          host values to copy
+   * @param devicePointer destination device pointer
+   * @return whatever the pointer-based {@code copy} overload returns
+   */
+  public static CUdeviceptr copy(final int[] data, final @NotNull CUdeviceptr devicePointer) {
+    return copy(Pointer.to(data), devicePointer, data.length * Sizeof.INT);
+  }
+
   public static CUdeviceptr copy(
       final @NotNull Pointer hostPointer,
       final @NotNull CUdeviceptr devicePointer,

diff --git a/cuda/src/main/java/com/expleague/cuda/KernelOperations.java b/cuda/src/main/java/com/expleague/cuda/KernelOperations.java
@@ -3,6 +3,7 @@
 import com.expleague.cuda.data.GPUMx;
 import com.expleague.cuda.data.GPUVec;
 import jcuda.Pointer;
+import jcuda.Sizeof;
 import jcuda.driver.CUfunction;
 import jcuda.driver.JCudaDriver;
 
@@ -37,7 +38,7 @@ public static void fMatrixReduce(final GPUMx matrix, final GPUMx result) {
         Pointer.to(matrix.gpuVec.devicePointer),
         Pointer.to(result.gpuVec.devicePointer));
 
-    final int blockDim = 6;
+    final int blockDim = result.rows;
 
     JCudaDriver.cuLaunchKernel(F_MATRIX_REDUCE,
         blockDim, 1, 1,
@@ -48,6 +49,29 @@ public static void fMatrixReduce(final GPUMx matrix, final GPUMx result) {
     JCudaDriver.cuCtxSynchronize();
   }
 
+
+  private static final CUfunction REDUCE5 =
+      JCudaHelper.getFunction(CU_FILE_PATH, "reduce5");
+
+  /**
+   * Launches the {@code reduce5} CUDA function over {@code args} with 32 threads per block and
+   * {@code 32 * Sizeof.FLOAT} bytes of dynamic shared memory, then waits for the context to
+   * finish.
+   *
+   * <p>NOTE(review): the grid size is {@code args.dim() / 32} with integer division, so it is
+   * 0 for fewer than 32 elements and ignores any remainder - confirm against the kernel
+   * whether rounding up is required.
+   *
+   * @param args   device vector passed as the first kernel argument
+   * @param result device vector passed as the second kernel argument
+   */
+  public static void reduce5(GPUVec args, GPUVec result) {
+    final int threadsPerBlock = 32;
+    final int length = args.dim();
+    final int blocks = length / threadsPerBlock;
+    final int sharedMemBytes = threadsPerBlock * Sizeof.FLOAT;
+
+    // Kernel argument list: input pointer, output pointer, element count.
+    final Pointer params = Pointer.to(
+        Pointer.to(args.devicePointer),
+        Pointer.to(result.devicePointer),
+        Pointer.to(new int[]{length}));
+
+    JCudaDriver.cuLaunchKernel(REDUCE5,
+        blocks, 1, 1,
+        threadsPerBlock, 1, 1,
+        sharedMemBytes, null,
+        params, null
+    );
+    JCudaDriver.cuCtxSynchronize();
+  }
+
   private static final CUfunction F_FILL =
       JCudaHelper.getFunction(CU_FILE_PATH, "fFill");
 

diff --git a/cuda/src/main/java/com/expleague/cuda/data/GPUVec.java b/cuda/src/main/java/com/expleague/cuda/data/GPUVec.java
@@ -90,15 +90,15 @@ public Vec sub(int start, int len) {
     return new GPUVec(len, cUdeviceptr);
   }
 
-  private static float[] convert(double[] doubles) {
+  /**
+   * Converts a {@code double} array to a new {@code float} array of the same length by
+   * narrowing each element with a {@code (float)} cast, which may lose precision.
+   *
+   * @param doubles values to convert
+   * @return a newly allocated array holding the narrowed values
+   */
+  public static float[] convert(double[] doubles) {
     float[] floats = new float[doubles.length];
     for (int i = 0; i < doubles.length; i++) {
       floats[i] = (float) doubles[i];
     }
     return floats;
   }
 
-  private static double[] convert(float[] floats) {
+  public static double[] convert(float[] floats) {
     double[] doubles = new double[floats.length];
     for (int i = 0; i < doubles.length; i++) {
       doubles[i] = floats[i];