I have some matrix code that unwraps fringe image phase, then does matrix inversion. This needs to be as fast as possible. There are 76800 (320 x 240 point cloud points) matrices to invert and then vector multiply.
The following code does this and the style is C code array math.
Does converting this to CUDA intrinsics make this faster, or is the compiler already generating SIMD instructions?
// Mc and Mp are each 3 rows by 4 columns float.
// A is 3x3 float, B is 3x1 float.
// uc and vc are float
A[0] = dt->Mc[8] * uc - dt->Mc[0];
A[1] = dt->Mc[9] * uc - dt->Mc[1];
A[2] = dt->Mc[10] * uc - dt->Mc[2];
A[3] = dt->Mc[8] * vc - dt->Mc[4];
A[4] = dt->Mc[9] * vc - dt->Mc[5];
A[5] = dt->Mc[10] * vc - dt->Mc[6];
A[6] = dt->Mp[8] * uphase - dt->Mp[0];
A[7] = dt->Mp[9] * uphase - dt->Mp[1];
A[8] = dt->Mp[10] * uphase - dt->Mp[2];
B[0] = dt->Mc[3] - dt->Mc[11] * uc;
B[1] = dt->Mc[7] - dt->Mc[11] * vc;
B[2] = dt->Mp[3] - dt->Mp[11] * uphase;
// A is 3 x 3, B is 1 x 3, XYZ is 1 x 3
// XYZ = A.inv() * B;
// Invert A
const float det =
A[0] * (A[4] * A[8] - A[7] * A[5]) -
A[1] * (A[3] * A[8] - A[5] * A[6]) +
A[2] * (A[3] * A[7] - A[4] * A[6]);
const float invdet = 1.0f / det;
Ainv[0] = (A[4] * A[8] - A[7] * A[5]) * invdet;
Ainv[1] = (A[2] * A[7] - A[1] * A[8]) * invdet;
Ainv[2] = (A[1] * A[5] - A[2] * A[4]) * invdet;
Ainv[3] = (A[5] * A[6] - A[3] * A[8]) * invdet;
Ainv[4] = (A[0] * A[8] - A[2] * A[6]) * invdet;
Ainv[5] = (A[3] * A[2] - A[0] * A[5]) * invdet;
Ainv[6] = (A[3] * A[7] - A[6] * A[4]) * invdet;
Ainv[7] = (A[6] * A[1] - A[0] * A[7]) * invdet;
Ainv[8] = (A[0] * A[4] - A[3] * A[1]) * invdet;
const float X = (float)(Ainv[0] * B[0] + Ainv[1] * B[1] + Ainv[2] * B[2]);
const float Y = (float)(Ainv[3] * B[0] + Ainv[4] * B[1] + Ainv[5] * B[2]);
const float Z = (float)(Ainv[6] * B[0] + Ainv[7] * B[1] + Ainv[8] * B[2]);
dt->xyzDataOut[idx].Pos[0] = (xflip ? -X : X) + xo;
dt->xyzDataOut[idx].Pos[1] = (yflip ? -Y : Y) + yo;
dt->xyzDataOut[idx].Pos[2] = (zflip ? -Z : Z) + zo;
The out global call is like
phase2XYZ_global <<<2, 512 >>> (uwdp);
and udwp is the unwrap parameters structure.