GPU device calculation for determinants of thousands of small complex matrices on each thread

The problem is to compute the determinants of thousands of small complex matrices (5x5, 4x4, and 3x3) inside a device function, that is, on each thread. Here is a working example:

#include <stdio.h>
#include <complex.h>
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 5

__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0, 0.0);
    cuDoubleComplex det1 = make_cuDoubleComplex(0.0, 0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0, 0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0, 0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0, 0.0);

    // Calculate the determinant by cofactor expansion along the first row
    // (row-major storage): det = det1 - det2 + det3 - det4
    det1 = cuCmul(matrix[0],
                  cuCadd( cuCsub( cuCmul( matrix[5],
                                          cuCsub( cuCmul(matrix[10], matrix[15]),
                                                  cuCmul(matrix[11], matrix[14]) ))
                                  , cuCmul(matrix[6],
                                           cuCsub( cuCmul(matrix[9], matrix[15]),
                                                   cuCmul(matrix[11], matrix[13]) ) ) )
                          , cuCmul(matrix[7],
                                   cuCsub( cuCmul(matrix[9], matrix[14]),
                                           cuCmul(matrix[10], matrix[13]) ) ) ) );



    det2 = cuCmul(matrix[1],
                  cuCadd( cuCsub( cuCmul( matrix[4],
                                          cuCsub( cuCmul(matrix[10], matrix[15]),
                                                  cuCmul(matrix[11], matrix[14]) ))
                                  , cuCmul(matrix[6],
                                           cuCsub( cuCmul(matrix[8], matrix[15]),
                                                   cuCmul(matrix[11], matrix[12]) ) ) )
                          , cuCmul(matrix[7],
                                   cuCsub( cuCmul(matrix[8], matrix[14]),
                                           cuCmul(matrix[10], matrix[12]) ) ) ) );


    det3 = cuCmul(matrix[2],
                  cuCadd( cuCsub( cuCmul( matrix[4],
                                          cuCsub( cuCmul(matrix[9], matrix[15]),
                                                  cuCmul(matrix[11], matrix[13]) ))
                                  , cuCmul(matrix[5],
                                           cuCsub( cuCmul(matrix[8], matrix[15]),
                                                   cuCmul(matrix[11], matrix[12]) ) ) )
                          , cuCmul(matrix[7],
                                   cuCsub( cuCmul(matrix[8], matrix[13]),
                                           cuCmul(matrix[9], matrix[12]) ) ) ) );


    det4 = cuCmul(matrix[3],
                  cuCadd( cuCsub( cuCmul( matrix[4],
                                          cuCsub( cuCmul(matrix[9], matrix[14]),
                                                  cuCmul(matrix[10], matrix[13]) ))
                                  , cuCmul(matrix[5],
                                           cuCsub( cuCmul(matrix[8], matrix[14]),
                                                   cuCmul(matrix[10], matrix[12]) ) ) )
                          , cuCmul(matrix[6],
                                   cuCsub( cuCmul(matrix[8], matrix[13]),
                                           cuCmul(matrix[9], matrix[12]) ) ) ) );

    det = cuCsub( cuCadd(det1, det3), cuCadd(det2, det4) );

    return det;
}


__global__ void det_5x5(cuDoubleComplex *matrix, cuDoubleComplex *res) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0, 0.0);


    printf("\t sum %.16E,%.16E \n",
           cuCreal(matrix[24]),
           cuCimag(matrix[24]));

    // Calculate the determinant using cofactor expansions
    for (int col = 0; col < N; col++) {
        cuDoubleComplex submatrix[N - 1][N - 1];
        int subrow = 0, subcol = 0;

        // Create the submatrix by excluding the current row and column
        for (int row = 1; row < N; row++) {
            for (int c = 0; c < N; c++) {
                if (c == col) {
                    continue;
                }
                submatrix[subrow][subcol] = matrix[row * N + c];
                subcol++;
            }
            subrow++;
            subcol = 0;
        }

        // Calculate the cofactor
        cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1), 0.0);
        cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign, matrix[col]), det_4x4(&submatrix[0][0]));

        // Accumulate the cofactors to calculate the determinant
        det = cuCadd(det, cofactor);
    }

    res[0] = det;

    return;
}

int main() {
    double complex matrix[N * N] = {
        2 + 2 * I, 3 + 0 * I, 1 + 0 * I, 5 + 0 * I, 7 + 0 * I,
        4 + 0 * I, 5 + 0 * I, 3 + 0 * I, 2 + 0 * I, 8 + 0 * I,
        1 + 0 * I, 3 + 0 * I, 2 + 0 * I, 4 + 0 * I, 3 + 0 * I,
        3 + 0 * I, 4 + 0 * I, 3 + 0 * I, 1 + 0 * I, 2 + 0 * I,
        5 + 0 * I, 6 + 0 * I, 4 + 0 * I, 3 + 0 * I, 2 + 2 * I
    };

    double complex *det;
    det = (double complex*)malloc(sizeof(double complex));
    memset(det, 0, sizeof(double complex));


    cuDoubleComplex *d_matrix, *d_res;
    cudaMalloc(&d_matrix, sizeof(cuDoubleComplex) * 25 );
    cudaMalloc(&d_res, sizeof(cuDoubleComplex) );

    cudaMemcpy(d_matrix, matrix, sizeof(cuDoubleComplex) * 25,
               cudaMemcpyHostToDevice );


    det_5x5 <<< 1, 1>>>(d_matrix, d_res);


    cudaMemcpy(det, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);

    printf("The determinant is: %.16E + %.16E i\n", creal(det[0]), cimag(det[0]));

    cudaFree(d_matrix);
    cudaFree(d_res);
    free(det);

    return 0;
}

However, I am wondering whether this implementation is even efficient. I am seeking more efficient methods to calculate the determinants of small complex matrices (5x5, 4x4, and 3x3) from within a device function. Any suggestions or answers would be greatly appreciated!

A good general strategy for operating on many very small, consecutively stored matrices is to let each thread handle one complete matrix. It is also often a good idea to copy the elements of such a tiny matrix to scalar variables at the start, perform all computation on the scalars, then write back to the result matrix. At least that used to be the case; the CUDA compiler may have improved to the point where this is no longer necessary, which is worth an experiment.
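A minimal sketch of this one-matrix-per-thread batching, assuming the 4x4 matrices are stored consecutively in row-major order in a single device array and reusing a __device__ det_4x4() routine like the one in the question (the kernel name and bounds check are my choices, untested):

__global__ void det_4x4_batched(const cuDoubleComplex *matrices,
                                cuDoubleComplex *dets, int batch)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < batch) {
        // copy this thread's 4x4 matrix into thread-local storage first
        cuDoubleComplex m[16];
        for (int j = 0; j < 16; j++) {
            m[j] = matrices[i * 16 + j];
        }
        // one thread computes one complete determinant
        dets[i] = det_4x4(m);
    }
}

A launch along the lines of det_4x4_batched<<<(batch + 255) / 256, 256>>>(d_matrices, d_dets, batch); would then process the whole batch.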

Obviously, in order to get good performance with this batching approach one needs to keep enough threads busy to fully utilize all GPU resources. With current GPUs this means using on the order of tens of thousands of threads. If you only have thousands of matrices to operate on, the GPU might not be fully utilized by the one-matrix-per-thread approach. At the same time, sharing the computation for each matrix between multiple threads may involve too much overhead.

A general note: Most computations that could be done via determinants have more efficient and/or numerically stable alternative computation approaches. You might want to investigate that aspect further. Computation of determinants is a bit of a red flag in my book.

Side remark: I wonder whether it could be beneficial to performance and/or accuracy to use a brute-force computation utilizing cuCfma(). Something like this (untested, written in browser):

__device__ cuDoubleComplex det_4x4 (cuDoubleComplex *a) 
{
    cuDoubleComplex det;

    // adjust as needed for row-major vs column-major storage
    cuDoubleComplex a11 = a[0];
    cuDoubleComplex a12 = a[1];
    cuDoubleComplex a13 = a[2];
    cuDoubleComplex a14 = a[3];
    cuDoubleComplex a21 = a[4];
    cuDoubleComplex a22 = a[5];
    cuDoubleComplex a23 = a[6];
    cuDoubleComplex a24 = a[7];
    cuDoubleComplex a31 = a[8];
    cuDoubleComplex a32 = a[9];
    cuDoubleComplex a33 = a[10];
    cuDoubleComplex a34 = a[11];
    cuDoubleComplex a41 = a[12];
    cuDoubleComplex a42 = a[13];
    cuDoubleComplex a43 = a[14];
    cuDoubleComplex a44 = a[15];

    cuDoubleComplex a11_a22 = cuCmul (a11, a22);
    cuDoubleComplex a11_a23 = cuCmul (a11, a23);
    cuDoubleComplex a11_a24 = cuCmul (a11, a24);
    cuDoubleComplex a12_a21 = cuCmul (a12, a21);
    cuDoubleComplex a12_a23 = cuCmul (a12, a23);
    cuDoubleComplex a12_a24 = cuCmul (a12, a24);
    cuDoubleComplex a13_a21 = cuCmul (a13, a21);
    cuDoubleComplex a13_a22 = cuCmul (a13, a22);
    cuDoubleComplex a13_a24 = cuCmul (a13, a24);
    cuDoubleComplex a14_a21 = cuCmul (a14, a21);
    cuDoubleComplex a14_a22 = cuCmul (a14, a22);
    cuDoubleComplex a14_a23 = cuCmul (a14, a23);
    cuDoubleComplex a31_a42 = cuCmul (a31, a42);
    cuDoubleComplex a31_a43 = cuCmul (a31, a43);
    cuDoubleComplex a31_a44 = cuCmul (a31, a44);
    cuDoubleComplex a32_a41 = cuCmul (a32, a41);
    cuDoubleComplex a32_a43 = cuCmul (a32, a43);
    cuDoubleComplex a32_a44 = cuCmul (a32, a44);
    cuDoubleComplex a33_a41 = cuCmul (a33, a41);
    cuDoubleComplex a33_a42 = cuCmul (a33, a42);
    cuDoubleComplex a33_a44 = cuCmul (a33, a44);
    cuDoubleComplex a34_a41 = cuCmul (a34, a41);
    cuDoubleComplex a34_a42 = cuCmul (a34, a42);
    cuDoubleComplex a34_a43 = cuCmul (a34, a43);

    // cuCfnma() is a hand-rolled helper that computes -a*b + c; see the note below
    det = cuCmul  ( a11_a22, a33_a44);
    det = cuCfma  ( a11_a24, a32_a43, det);
    det = cuCfma  ( a11_a23, a34_a42, det);
    det = cuCfnma ( a11_a24, a33_a42, det);
    det = cuCfnma ( a11_a22, a34_a43, det);
    det = cuCfnma ( a11_a23, a32_a44, det);
    det = cuCfnma ( a12_a21, a33_a44, det);
    det = cuCfnma ( a12_a23, a34_a41, det);
    det = cuCfnma ( a12_a24, a31_a43, det);
    det = cuCfma  ( a12_a24, a33_a41, det);
    det = cuCfma  ( a12_a21, a34_a43, det);
    det = cuCfma  ( a12_a23, a31_a44, det);
    det = cuCfma  ( a13_a21, a32_a44, det);
    det = cuCfma  ( a13_a22, a34_a41, det);
    det = cuCfma  ( a13_a24, a31_a42, det);
    det = cuCfnma ( a13_a24, a32_a41, det);
    det = cuCfnma ( a13_a21, a34_a42, det);
    det = cuCfnma ( a13_a22, a31_a44, det);
    det = cuCfnma ( a14_a21, a32_a43, det);
    det = cuCfnma ( a14_a22, a33_a41, det);
    det = cuCfnma ( a14_a23, a31_a42, det);
    det = cuCfma  ( a14_a23, a32_a41, det);
    det = cuCfma  ( a14_a21, a33_a42, det);
    det = cuCfma  ( a14_a22, a31_a43, det);

    return det;
}

Come to think of it, something like det = cuCfma (-a11_a24, a33_a42, det); is not expressible given CUDA's rudimentary support for complex numbers (there is no unary minus for cuDoubleComplex), which is why the code above calls a hand-rolled cuCfnma() that computes -a*b+c, derived from the existing cuCfma() code in cuComplex.h.
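A minimal sketch of such a cuCfnma() helper, obtained by flipping signs in the cuCfma() implementation from cuComplex.h (the name and the inlining attribute are my choices, untested):

__device__ __forceinline__ cuDoubleComplex cuCfnma(cuDoubleComplex x,
                                                   cuDoubleComplex y,
                                                   cuDoubleComplex d)
{
    // computes -x*y + d, following the structure of cuCfma() in cuComplex.h
    double real_res = -(cuCreal(x) * cuCreal(y)) + cuCreal(d);
    real_res        =  (cuCimag(x) * cuCimag(y)) + real_res;
    double imag_res = -(cuCreal(x) * cuCimag(y)) + cuCimag(d);
    imag_res        = -(cuCimag(x) * cuCreal(y)) + imag_res;
    return make_cuDoubleComplex(real_res, imag_res);
}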

@njuffa I did not explain it well earlier. It is already done on a single thread: every thread computes these thousands of determinants. I am looking for efficient ways to compute these determinants from the device side.

This is the aspect that I asked the question about:

“A general note: Most computations that could be done via determinants have more efficient and/or numerically stable alternative computation approaches. You might want to investigate that aspect further. Computation of determinants is a bit of a red flag in my book.”

If you are looking for possible alternatives to computing these determinants, it would be important to know the context of the computation, which has not been revealed so far from what I can see.

Solvers or factorizations of some sort (for example LU or Cholesky) may offer a more advantageous way of computing whatever needs to be computed. For some use cases one is only interested in the sign of the determinant, and actually computing determinants in ordinary precision does not reliably provide that due to numerical issues.

So if the larger goal of this computation were known, along with what these matrices represent in context and anything about their structure or constraints on the matrix elements that one could take advantage of, that might be a basis for a discussion about the most efficient alternative way to compute whatever needs computing. Of course, it could turn out that this is a question more for domain experts than CUDA experts.

@njuffa I don't think the context is necessary here. I am looking for an alternative method to compute the determinant of a general matrix inside a device function. The example I have provided is one of the most straightforward methods, but what I am seeking is an alternative approach, such as LU/Cholesky decomposition, that can be called from a device function. Unfortunately, libraries like LAPACK are not callable from CUDA device code, but maybe a compatible function from some other package could work within the CUDA device environment.

I don't think one would want to use LAPACK on matrices this small, on any platform. For example, if one wants to transform the matrix to upper triangular and then compute the determinant from the diagonal, that can easily be programmed in a few lines of code as a device function.
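A minimal sketch of such a device function, assuming row-major storage, partial pivoting by magnitude, and that the input matrix may be overwritten (the function name is my choice, untested):

__device__ cuDoubleComplex det_nxn_lu(cuDoubleComplex *a, int n)
{
    // Gaussian elimination to upper triangular form; the determinant is the
    // product of the diagonal, with the sign flipped once per row swap.
    cuDoubleComplex det = make_cuDoubleComplex(1.0, 0.0);
    for (int k = 0; k < n; k++) {
        // partial pivoting: pick the remaining row with the largest |a[i][k]|
        int piv = k;
        double pmax = cuCabs(a[k * n + k]);
        for (int i = k + 1; i < n; i++) {
            double v = cuCabs(a[i * n + k]);
            if (v > pmax) { pmax = v; piv = i; }
        }
        if (pmax == 0.0) {
            return make_cuDoubleComplex(0.0, 0.0);  // singular matrix
        }
        if (piv != k) {
            for (int j = k; j < n; j++) {
                cuDoubleComplex tmp = a[k * n + j];
                a[k * n + j] = a[piv * n + j];
                a[piv * n + j] = tmp;
            }
            det = make_cuDoubleComplex(-cuCreal(det), -cuCimag(det));  // row swap
        }
        det = cuCmul(det, a[k * n + k]);
        // eliminate the entries below the pivot
        for (int i = k + 1; i < n; i++) {
            cuDoubleComplex f = cuCdiv(a[i * n + k], a[k * n + k]);
            for (int j = k + 1; j < n; j++) {
                a[i * n + j] = cuCsub(a[i * n + j], cuCmul(f, a[k * n + j]));
            }
        }
    }
    return det;
}

For fixed sizes of 3, 4, or 5 one would probably pass n as a template parameter or hard-code it so the compiler can fully unroll the loops.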

My suggestion would be to set up a nice scaffold for benchmarking (and accuracy, if needed), then try the various methods of computing determinants one by one, starting with the simplest (the brute-force scheme I posted above). After a day of work you will know which method of computing determinants works best.
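For the timing part, something along these lines could serve as a starting point, assuming a kernel det_kernel and device buffers d_matrices and d_dets that are already set up (all names are placeholders):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
det_kernel<<<(batch + 255) / 256, 256>>>(d_matrices, d_dets, batch);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("kernel time: %.3f ms for %d determinants\n", ms, batch);

cudaEventDestroy(start);
cudaEventDestroy(stop);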

@njuffa
I understand your perspective, but that is precisely what I am seeking clarification on. It would be helpful to have a predefined library or function to compare against. If you have any alternative solutions (aside from the brute-force approach, which resembles the example I shared), please feel free to share an example.