static void | 
LibMatrixCUDA.abs(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | 
 Performs an "abs" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.acos(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs an "acos" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.asin(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs an "asin" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.atan(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs an "atan" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.axpy(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    MatrixObject in2,
    String outputName,
    double constant) | 
 Performs daxpy operation 
 | 
static void | 
LibMatrixCuDNN.batchNormalizationBackward(GPUContext gCtx,
                          String instName,
                          MatrixObject image,
                          MatrixObject dout,
                          MatrixObject scale,
                          MatrixObject dX,
                          MatrixObject dScale,
                          MatrixObject dBias,
                          double epsilon,
                          MatrixObject resultSaveMean,
                          MatrixObject resultSaveInvVariance) | 
 This method computes the backpropagation errors for image, scale and bias of batch normalization layer 
 | 
static void | 
LibMatrixCuDNN.batchNormalizationForwardInference(GPUContext gCtx,
                                  String instName,
                                  MatrixObject image,
                                  MatrixObject scale,
                                  MatrixObject bias,
                                  MatrixObject runningMean,
                                  MatrixObject runningVar,
                                  MatrixObject ret,
                                  double epsilon) | 
 Performs the forward BatchNormalization layer computation for inference 
 | 
static void | 
LibMatrixCuDNN.batchNormalizationForwardTraining(GPUContext gCtx,
                                 String instName,
                                 MatrixObject image,
                                 MatrixObject scale,
                                 MatrixObject bias,
                                 MatrixObject runningMean,
                                 MatrixObject runningVar,
                                 MatrixObject ret,
                                 MatrixObject retRunningMean,
                                 MatrixObject retRunningVar,
                                 double epsilon,
                                 double exponentialAverageFactor,
                                 MatrixObject resultSaveMean,
                                 MatrixObject resultSaveInvVariance) | 
 Performs the forward BatchNormalization layer computation for training 
 | 
static void | 
LibMatrixCUDA.biasAdd(GPUContext gCtx,
       String instName,
       MatrixObject input,
       MatrixObject bias,
       MatrixObject outputBlock) | 
 Performs the operation corresponding to the DML script:
 ones = matrix(1, rows=1, cols=Hout*Wout)
 output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 This operation often follows conv2d, and hence we have introduced the bias_add(input, bias) built-in function 
 | 
static void | 
LibMatrixCUDA.biasMultiply(GPUContext gCtx,
            String instName,
            MatrixObject input,
            MatrixObject bias,
            MatrixObject outputBlock) | 
 Performs the operation corresponding to the DML script:
 ones = matrix(1, rows=1, cols=Hout*Wout)
 output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 This operation is often followed by conv2d and hence we have introduced bias_multiply(input, bias) built-in function 
 | 
static void | 
LibMatrixCUDA.cbind(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     MatrixObject in2,
     String outputName) | 
  | 
static void | 
LibMatrixCUDA.ceil(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs a "ceil" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.channelSums(GPUContext gCtx,
           String instName,
           MatrixObject input,
           MatrixObject outputBlock,
           long C,
           long HW) | 
 Perform channel_sums operations: out = rowSums(matrix(colSums(A), rows=C, cols=HW)) 
 | 
static int | 
LibMatrixCUDA.computeNNZ(GPUContext gCtx,
          jcuda.Pointer densePtr,
          int length) | 
 Utility to compute number of non-zeroes on the GPU 
 | 
static void | 
LibMatrixCuDNN.conv2d(GPUContext gCtx,
      String instName,
      MatrixObject image,
      MatrixObject filter,
      MatrixObject outputBlock,
      int N,
      int C,
      int H,
      int W,
      int K,
      int R,
      int S,
      int pad_h,
      int pad_w,
      int stride_h,
      int stride_w,
      int P,
      int Q,
      double intermediateMemoryBudget) | 
 Performs a 2D convolution 
 | 
static void | 
LibMatrixCuDNN.conv2dBackwardData(GPUContext gCtx,
                  String instName,
                  MatrixObject filter,
                  MatrixObject dout,
                  MatrixObject output,
                  int N,
                  int C,
                  int H,
                  int W,
                  int K,
                  int R,
                  int S,
                  int pad_h,
                  int pad_w,
                  int stride_h,
                  int stride_w,
                  int P,
                  int Q,
                  double intermediateMemoryBudget) | 
 This method computes the backpropagation errors for previous layer of convolution operation 
 | 
static void | 
LibMatrixCuDNN.conv2dBackwardFilter(GPUContext gCtx,
                    String instName,
                    MatrixObject image,
                    MatrixObject dout,
                    MatrixObject outputBlock,
                    int N,
                    int C,
                    int H,
                    int W,
                    int K,
                    int R,
                    int S,
                    int pad_h,
                    int pad_w,
                    int stride_h,
                    int stride_w,
                    int P,
                    int Q,
                    double intermediateMemoryBudget) | 
 This method computes the backpropagation errors for filter of convolution operation 
 | 
static void | 
LibMatrixCuDNN.conv2dBiasAdd(GPUContext gCtx,
             String instName,
             MatrixObject image,
             MatrixObject bias,
             MatrixObject filter,
             MatrixObject output,
             int N,
             int C,
             int H,
             int W,
             int K,
             int R,
             int S,
             int pad_h,
             int pad_w,
             int stride_h,
             int stride_w,
             int P,
             int Q,
             double intermediateMemoryBudget) | 
 Does a 2D convolution followed by a bias_add 
 | 
static void | 
LibMatrixCUDA.cos(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | 
 Performs a "cos" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.cosh(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs a "cosh" operation on a matrix on the GPU 
 | 
static LibMatrixCuDNNConvolutionAlgorithm | 
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionBackwardDataAlgorithm(GPUContext gCtx,
                                        String instName,
                                        int N,
                                        int C,
                                        int H,
                                        int W,
                                        int K,
                                        int R,
                                        int S,
                                        int pad_h,
                                        int pad_w,
                                        int stride_h,
                                        int stride_w,
                                        int P,
                                        int Q,
                                        long workspaceLimit) | 
 Factory method to get the algorithm wrapper for convolution backward data 
 | 
static LibMatrixCuDNNConvolutionAlgorithm | 
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionBackwardFilterAlgorithm(GPUContext gCtx,
                                          String instName,
                                          int N,
                                          int C,
                                          int H,
                                          int W,
                                          int K,
                                          int R,
                                          int S,
                                          int pad_h,
                                          int pad_w,
                                          int stride_h,
                                          int stride_w,
                                          int P,
                                          int Q,
                                          long workspaceLimit) | 
 Factory method to get the algorithm wrapper for convolution backward filter 
 | 
static LibMatrixCuDNNConvolutionAlgorithm | 
LibMatrixCuDNNConvolutionAlgorithm.cudnnGetConvolutionForwardAlgorithm(GPUContext gCtx,
                                   String instName,
                                   int N,
                                   int C,
                                   int H,
                                   int W,
                                   int K,
                                   int R,
                                   int S,
                                   int pad_h,
                                   int pad_w,
                                   int stride_h,
                                   int stride_w,
                                   int P,
                                   int Q,
                                   long workspaceLimit) | 
 Factory method to get the algorithm wrapper for convolution forward 
 | 
static LibMatrixCuDNNPoolingDescriptors | 
LibMatrixCuDNNPoolingDescriptors.cudnnPoolingBackwardDescriptors(GPUContext gCtx,
                               String instName,
                               int N,
                               int C,
                               int H,
                               int W,
                               int K,
                               int R,
                               int S,
                               int pad_h,
                               int pad_w,
                               int stride_h,
                               int stride_w,
                               int P,
                               int Q,
                               LibMatrixDNN.PoolingType poolingType) | 
 Get descriptors for maxpooling backward operation 
 | 
static LibMatrixCuDNNPoolingDescriptors | 
LibMatrixCuDNNPoolingDescriptors.cudnnPoolingDescriptors(GPUContext gCtx,
                       String instName,
                       int N,
                       int C,
                       int H,
                       int W,
                       int K,
                       int R,
                       int S,
                       int pad_h,
                       int pad_w,
                       int stride_h,
                       int stride_w,
                       int P,
                       int Q,
                       LibMatrixDNN.PoolingType poolingType) | 
 Get descriptors for maxpooling operation 
 | 
static void | 
LibMatrixCUDA.cumulativeScan(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              String kernelFunction,
              MatrixObject in,
              String outputName) | 
 Cumulative scan 
 | 
static void | 
LibMatrixCUDA.cumulativeSumProduct(ExecutionContext ec,
                    GPUContext gCtx,
                    String instName,
                    String kernelFunction,
                    MatrixObject in,
                    String outputName) | 
 Cumulative sum-product kernel cascade invocation 
 | 
static void | 
LibMatrixCUDA.denseTranspose(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              jcuda.Pointer A,
              jcuda.Pointer C,
              long numRowsA,
              long numColsA) | 
 Computes C = t(A) 
 | 
void | 
CudaSupportFunctions.deviceToHost(GPUContext gCtx,
            jcuda.Pointer src,
            double[] dest,
            String instName,
            boolean isEviction) | 
  | 
void | 
DoublePrecisionCudaSupportFunctions.deviceToHost(GPUContext gCtx,
            jcuda.Pointer src,
            double[] dest,
            String instName,
            boolean isEviction) | 
  | 
void | 
SinglePrecisionCudaSupportFunctions.deviceToHost(GPUContext gCtx,
            jcuda.Pointer src,
            double[] dest,
            String instName,
            boolean isEviction) | 
  | 
static jcuda.Pointer | 
LibMatrixCUDA.double2float(GPUContext gCtx,
            jcuda.Pointer A,
            jcuda.Pointer ret,
            int numElems) | 
  | 
static void | 
LibMatrixCUDA.exp(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | 
 Performs an "exp" operation on a matrix on the GPU 
 | 
static jcuda.Pointer | 
LibMatrixCUDA.float2double(GPUContext gCtx,
            jcuda.Pointer A,
            jcuda.Pointer ret,
            int numElems) | 
  | 
static void | 
LibMatrixCUDA.floor(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     String outputName) | 
 Performs a "floor" operation on a matrix on the GPU 
 | 
static JCudaKernels | 
LibMatrixCUDA.getCudaKernels(GPUContext gCtx) | 
  | 
static jcuda.Pointer | 
LibMatrixCUDA.getDensePointer(GPUContext gCtx,
               MatrixObject input,
               String instName) | 
 Convenience method to get jcudaDenseMatrixPtr. 
 | 
static jcuda.Pointer | 
LibMatrixCuDNN.getDensePointerForCuDNN(GPUContext gCtx,
                       MatrixObject image,
                       String instName,
                       int numRows,
                       int numCols) | 
 Convenience method to get jcudaDenseMatrixPtr. 
 | 
static long | 
LibMatrixCUDA.getNnz(GPUContext gCtx,
      String instName,
      MatrixObject mo,
      boolean recomputeDenseNNZ) | 
 Note: if the matrix is in dense format, it explicitly re-computes the number of nonzeros. 
 | 
void | 
CudaSupportFunctions.hostToDevice(GPUContext gCtx,
            double[] src,
            jcuda.Pointer dest,
            String instName) | 
  | 
void | 
DoublePrecisionCudaSupportFunctions.hostToDevice(GPUContext gCtx,
            double[] src,
            jcuda.Pointer dest,
            String instName) | 
  | 
void | 
SinglePrecisionCudaSupportFunctions.hostToDevice(GPUContext gCtx,
            double[] src,
            jcuda.Pointer dest,
            String instName) | 
  | 
static boolean | 
LibMatrixCUDA.isInSparseFormat(GPUContext gCtx,
                MatrixObject mo) | 
  | 
static void | 
LibMatrixCUDA.log(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | 
 Performs a "log" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCuDNN.lstm(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    jcuda.Pointer X,
    jcuda.Pointer wPointer,
    jcuda.Pointer out0,
    jcuda.Pointer c0,
    boolean return_sequences,
    String outputName,
    String cyName,
    int N,
    int M,
    int D,
    int T) | 
 Computes the forward pass for an LSTM layer with M neurons. 
 | 
static void | 
LibMatrixCuDNN.lstmBackward(ExecutionContext ec,
            GPUContext gCtx,
            String instName,
            jcuda.Pointer x,
            jcuda.Pointer hx,
            jcuda.Pointer cx,
            jcuda.Pointer wPointer,
            String doutName,
            String dcyName,
            String dxName,
            String dwName,
            String dbName,
            String dhxName,
            String dcxName,
            boolean return_sequences,
            int N,
            int M,
            int D,
            int T) | 
  | 
static MatrixObject | 
LibMatrixCuMatMult.matmult(ExecutionContext ec,
       GPUContext gCtx,
       String instName,
       MatrixObject left,
       MatrixObject right,
       String outputName,
       boolean isLeftTransposed,
       boolean isRightTransposed) | 
 Matrix multiply on GPU. Examines sparsity and shapes and routes the call to the
 appropriate method from cuBLAS or cuSparse: C = op(A) x op(B).
 The user is expected to call
 ec.releaseMatrixOutputForGPUInstruction(outputName); 
 | 
static void | 
LibMatrixCUDA.matmultTSMM(ExecutionContext ec,
           GPUContext gCtx,
           String instName,
           MatrixObject left,
           String outputName,
           boolean isLeftTransposed) | 
 Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...) 
 | 
static void | 
LibMatrixCUDA.matrixMatrixArithmetic(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in1,
                      MatrixObject in2,
                      String outputName,
                      boolean isLeftTransposed,
                      boolean isRightTransposed,
                      BinaryOperator op) | 
 Performs elementwise arithmetic operation specified by op of two input matrices in1 and in2 
 | 
static void | 
LibMatrixCUDA.matrixMatrixRelational(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in1,
                      MatrixObject in2,
                      String outputName,
                      BinaryOperator op) | 
 Performs elementwise relational operation specified by op of two input matrices in1 and in2 
 | 
static void | 
LibMatrixCUDA.matrixScalarArithmetic(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in,
                      String outputName,
                      boolean isInputTransposed,
                      ScalarOperator op) | 
 Entry point to perform elementwise matrix-scalar arithmetic operation specified by op 
 | 
static void | 
LibMatrixCUDA.matrixScalarOp(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              MatrixObject in,
              String outputName,
              boolean isInputTransposed,
              ScalarOperator op) | 
 Utility to do matrix-scalar operation kernel 
 | 
static void | 
LibMatrixCUDA.matrixScalarRelational(ExecutionContext ec,
                      GPUContext gCtx,
                      String instName,
                      MatrixObject in,
                      String outputName,
                      ScalarOperator op) | 
 Entry point to perform elementwise matrix-scalar relational operation specified by op 
 | 
static void | 
LibMatrixCuDNN.pooling(GPUContext gCtx,
       String instName,
       MatrixObject image,
       MatrixObject outputBlock,
       int N,
       int C,
       int H,
       int W,
       int K,
       int R,
       int S,
       int pad_h,
       int pad_w,
       int stride_h,
       int stride_w,
       int P,
       int Q,
       LibMatrixDNN.PoolingType poolingType,
       double intermediateMemoryBudget) | 
 Performs maxpooling on GPU by exploiting cudnnPoolingForward(...) 
 | 
static void | 
LibMatrixCuDNN.poolingBackward(GPUContext gCtx,
               String instName,
               MatrixObject image,
               MatrixObject dout,
               MatrixObject maxpoolOutput,
               MatrixObject outputBlock,
               int N,
               int C,
               int H,
               int W,
               int K,
               int R,
               int S,
               int pad_h,
               int pad_w,
               int stride_h,
               int stride_w,
               int P,
               int Q,
               LibMatrixDNN.PoolingType poolingType,
               double intermediateMemoryBudget) | 
 Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 This method computes the backpropagation errors for previous layer of maxpooling operation 
 | 
static void | 
LibMatrixCUDA.rbind(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     MatrixObject in2,
     String outputName) | 
  | 
static void | 
LibMatrixCuDNN.relu(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in,
    String outputName) | 
 Performs the relu operation on the GPU. 
 | 
static void | 
LibMatrixCUDA.reluBackward(GPUContext gCtx,
            String instName,
            MatrixObject input,
            MatrixObject dout,
            MatrixObject outputBlock) | 
 This method computes the backpropagation errors for previous layer of relu operation 
 | 
static void | 
LibMatrixCUDA.round(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     String outputName) | 
 Performs a "round" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.sigmoid(ExecutionContext ec,
       GPUContext gCtx,
       String instName,
       MatrixObject in1,
       String outputName) | 
 Performs a "sigmoid" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.sign(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs a "sign" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.sin(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | 
 Performs a "sin" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.sinh(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs a "sinh" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.sliceOperations(ExecutionContext ec,
               GPUContext gCtx,
               String instName,
               MatrixObject in1,
               IndexRange ixrange,
               String outputName) | 
 Method to perform rightIndex operation for given lower and upper bounds in row and column dimensions. 
 | 
static void | 
LibMatrixCuDNN.softmax(ExecutionContext ec,
       GPUContext gCtx,
       String instName,
       MatrixObject in1,
       String outputName) | 
 Performs a "softmax" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.solve(ExecutionContext ec,
     GPUContext gCtx,
     String instName,
     MatrixObject in1,
     MatrixObject in2,
     String outputName) | 
 Implements the "solve" function for SystemDS: solves Ax = B (A is of size m*n, B is of size m*1, x is of size n*1) 
 | 
static void | 
LibMatrixCUDA.sqrt(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs a "sqrt" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.tan(ExecutionContext ec,
   GPUContext gCtx,
   String instName,
   MatrixObject in1,
   String outputName) | 
 Performs a "tan" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.tanh(ExecutionContext ec,
    GPUContext gCtx,
    String instName,
    MatrixObject in1,
    String outputName) | 
 Performs a "tanh" operation on a matrix on the GPU 
 | 
static void | 
LibMatrixCUDA.transpose(ExecutionContext ec,
         GPUContext gCtx,
         String instName,
         MatrixObject in,
         String outputName) | 
 Transposes the input matrix using cublasDgeam 
 | 
static void | 
LibMatrixCUDA.unaryAggregate(ExecutionContext ec,
              GPUContext gCtx,
              String instName,
              MatrixObject in1,
              String output,
              AggregateUnaryOperator op) | 
 Entry point to perform Unary aggregate operations on the GPU. 
 |