I have been beating my brains out over this for a couple weeks now and could really use some help. I have a largish matrix in which each element is a smaller 2x2 matrix of complex doubles. I need to multiply the 2x2 matrices together over each row to arrive at a vector of 2x2 complex doubles.

I began with a naive algorithm in which each thread loops over all the 2x2s in a row of the main matrix and performs the multiplications in turn. I have been using the visual profiler and trying to improve the speed and efficiency of the algorithm. My system is: I7 / GTX 580 / 64-bit Win 7 with -G and -lineinfo flags enabled. For all times below the large matrix is 5000 rows x 512 columns. The naive kernel runs in 14.6 ms.

So far I have tried:

Registers (44) and shared memory seem to be main candidates for the limiting factors, depending on the launch configuration. The problem is that the 2x2 matrix multiplication requires a lot of balls in the air - 3 sets of 4 complex doubles (for a total of 24 doubles) on top of all the ordinary indexing and looping variables. Exacerbating the problem is the fact that each element in the main matrix consists of 8 doubles, which makes it difficult or impossible to achieve coalesced loads and stores.

Complete program with naive kernel is below:

```
#include "cuComplex.h"
#include <stdio.h>
struct M22DC
{
cuDoubleComplex P11;
cuDoubleComplex P12;
cuDoubleComplex P21;
cuDoubleComplex P22;
};
__device__ __host__ void matMult(M22DC *left, M22DC *right, M22DC *result)
{
result->P11.x = left->P11.x * right->P11.x - left->P11.y * right->P11.y + left->P12.x * right->P21.x - left->P12.y * right->P21.y;
result->P11.y = left->P11.y * right->P11.x + left->P11.x * right->P11.y + left->P12.y * right->P21.x + left->P12.x * right->P21.y;
result->P12.x = left->P11.x * right->P12.x - left->P11.y * right->P12.y + left->P12.x * right->P22.x - left->P12.y * right->P22.y;
result->P12.y = left->P11.y * right->P12.x + left->P11.x * right->P12.y + left->P12.y * right->P22.x + left->P12.x * right->P22.y;
result->P21.x = left->P21.x * right->P11.x - left->P21.y * right->P11.y + left->P22.x * right->P21.x - left->P22.y * right->P21.y;
result->P21.y = left->P21.y * right->P11.x + left->P21.x * right->P11.y + left->P22.y * right->P21.x + left->P22.x * right->P21.y;
result->P22.x = left->P21.x * right->P12.x - left->P21.y * right->P12.y + left->P22.x * right->P22.x - left->P22.y * right->P22.y;
result->P22.y = left->P21.y * right->P12.x + left->P21.x * right->P12.y + left->P22.y * right->P22.x + left->P22.x * right->P22.y;
}
__global__ void matMultRows(M22DC *g_idata, M22DC *g_odata, unsigned int nRows, unsigned int nCols)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idxRowStart = i * nCols;
if (i >= nRows) return;
M22DC a = g_idata[idxRowStart];
M22DC b;
M22DC c;
for (int j = 1; j < nCols; j++)
{
b = g_idata[idxRowStart + j];
matMult(&a, &b, &c);
a = c;
}
g_odata[idxRowStart] = a;
}
void initializeRandom(M22DC *a, const unsigned int n)
{
double *d = (double *)a;
for (int i = 0; i < n; i++)
{
for (int j = 0; j < 8; j++)
{
d[i * 8 + j] = ((double) rand() / (RAND_MAX));
}
}
}
void showM22DC(M22DC *m)
{
printf("| R=%.6e, I=%.6e R=%.6e, I=%.6e |\n", m->P11.x, m->P11.y, m->P12.x, m->P12.y);
printf("| R=%.6e, I=%.6e R=%.6e, I=%.6e |\n", m->P21.x, m->P21.y, m->P22.x, m->P22.y);
}
bool equivalent(double a, double b)
{
int aexp, bexp;
double as = frexp(a, &aexp);
double bs = frexp(b, &bexp);
return aexp == bexp && abs(as - bs) <= 0.00000001;
}
bool equivalent(M22DC a, M22DC b)
{
return
equivalent(a.P11.x, b.P11.x) &&
equivalent(a.P11.y, b.P11.y) &&
equivalent(a.P12.x, b.P12.x) &&
equivalent(a.P12.y, b.P12.y) &&
equivalent(a.P21.x, b.P21.x) &&
equivalent(a.P21.y, b.P21.y) &&
equivalent(a.P22.x, b.P22.x) &&
equivalent(a.P22.y, b.P22.y);
}
bool checkMatMult(M22DC *cpu, M22DC *gpu, unsigned int rows, unsigned int cols)
{
bool ok = true;
for (int i = 0; i < rows; i++)
{
M22DC a = cpu[i * cols];
M22DC b;
M22DC c;
for (int j = 1; j < cols; j++)
{
b = cpu[i * cols + j];
matMult(&a, &b, &c);
a = c;
}
if (!equivalent(a, gpu[i * cols]))
{
printf("fail at row %i\n", i);
printf("CPU:\n");
showM22DC(&a);
printf("\nGPU:\n");
showM22DC(&gpu[i * cols]);
ok = false;
break;
}
}
return ok;
}
void testMatMultRows()
{
const unsigned int maxThreads = 128;
const unsigned int rows = 5000;
const unsigned int cols = 512;
const unsigned int n = rows * cols;
const unsigned int threads = (rows < maxThreads) ? rows : maxThreads;
const unsigned int blocks = ceil((double)rows / threads);
M22DC *a = (M22DC *)malloc(n * sizeof(M22DC));
M22DC *b = (M22DC *)malloc(n * sizeof(M22DC));
M22DC *dev_a = 0;
M22DC *dev_b = 0;
initializeRandom(a, n);
cudaMalloc((void**)&dev_a, n * sizeof(M22DC));
cudaMalloc((void**)&dev_b, n * sizeof(M22DC));
cudaMemcpy(dev_a, a, n * sizeof(M22DC), cudaMemcpyHostToDevice);
matMultRows<<<blocks, threads>>>(dev_a, dev_b, rows, cols);
cudaMemcpy(b, dev_b, n * sizeof(M22DC), cudaMemcpyDeviceToHost);
if (checkMatMult(a, b, rows, cols)) printf("pass\n");
Error:
if (a) free(a);
if (b) free(b);
if (dev_a) cudaFree(dev_a);
if (dev_b) cudaFree(dev_b);
}
int main(int argc, char *argv[])
{
testMatMultRows();
cudaDeviceReset();
}
```

Any thoughts would be deeply appreciated!

Mike