# Pycuda - Adapt existing code and Kernel code to perform a high number of 3x3 matrix inversion

Following a previous question ( https://stackoverflow.com/questions/55007384/pycuda-best-way-to-perform-a-high-number-of-small-matrix-inversion-cublas-or/55085096 ), considering the inversion of 4x4 matrix, I would like to do the same but with 3x3 matrix. As @Robert Crovella said, this change implies a complete rewrite.

I have modified the dimension and used direct formula from the link given by @Robert Crovella (https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/) but without success; here the code modified :

``````import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit

# kernel of 3x3 inversion
kernel_3x3 = SourceModule("""

// in-place is acceptable i.e. out == in)
// T = float or double only

typedef float T;
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

size_t idx = ix + blockDim.x*blockIdx.x;
if (ix < n*9){
T det = in[0+idx]*(in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx]) - in[1+idx]*(in[3+idx]*in[8+idx]-in[6+idx]*in[5+idx]) + in[2+idx]*(in[3+idx]*in[7+idx]-in[6+idx]*in[4+idx]);
out[0+idx] = (in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx])/det;
out[1+idx] = (in[2+idx]*in[7+idx]-in[1+idx]*in[8+idx])/det;
out[2+idx] = (in[1+idx]*in[5+idx]-in[2+idx]*in[4+idx])/det;
out[3+idx] = (in[6+idx]*in[5+idx]-in[3+idx]*in[8+idx])/det;
out[4+idx] = (in[0+idx]*in[8+idx]-in[2+idx]*in[6+idx])/det;
out[5+idx] = (in[2+idx]*in[3+idx]-in[0+idx]*in[5+idx])/det;
out[6+idx] = (in[3+idx]*in[7+idx]-in[4+idx]*in[6+idx])/det;
out[7+idx] = (in[1+idx]*in[6+idx]-in[0+idx]*in[7+idx])/det;
out[8+idx] = (in[0+idx]*in[4+idx]-in[1+idx]*in[3+idx])/det;
__syncwarp();
}
}

""")

def gpuinv3x3 (inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*9), dtype= np.float32)
# Get kernel function
matinv3x3 = kernel_3x3.get_function("inv3x3")
# Define block, grid and compute
blockDim = (81,1,1) # do not change
gridDim = ((n/9)+1,1,1)
# Kernel function
matinv3x3 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
``````

The first matrix is correctly inverted but not the second one (the identity matrix which has identity matrix as inverse) :

``````[[[ 2.         -0.         -1.        ]
[-1.         -0.33333334  1.        ]
[-0.          0.33333334 -0.        ]]

[[ 1.         -0.5        -0.        ]
[       -inf  1.         -1.        ]
[        nan         nan  1.        ]]]
``````

So, this issue doesn’t seem to come from the kernel code but rather the batch size or something similar with dimensions of global 1D array (in my code, you can see the 2 3x3 matrices formatted as 1D array of 18 elements ( `inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)` ).

Anyone could see what’s wrong in this code ? Especially, the issue of bad inversion on the second matrix . Just a last point, an odd size of group doesn’t imply issues for GPU processing ?

Regards