Is there a way to avoid atomicAdd in my situation?

I’m doing an operation as the figure below.

https://i.stack.imgur.com/nVNdU.jpg

As shown in the figure, I make a small matrix using about one million vectors and accumulate it in a large prepared matrix.

I need an idea that can improve performance without exceeding 8Gb of GPU global memory.

How can I avoid atomic operations? I use the GTX1080. Existing kernels take about 250ms.

Here is my kernel.

// Accumulates, for each masked-in cost term in `src`, four weighted 24x24
// Jacobian outer-product blocks into the global system matrix A (row-major,
// row stride cols = 6*(mw_width+1)) and two 24-element segments into the
// right-hand side b. One thread processes one src entry; overlapping target
// blocks from different entries are combined with atomicAdd.
// NOTE(review): atomicAdd on double requires SM60+ (OK on the GTX 1080).
// Launch: 1D grid with at least `num` threads.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
	int idx = threadIdx.x + blockIdx.x * blockDim.x;
	if (idx < num)	// tail guard: grid may be over-provisioned
	{
		if (src[idx].mask == 1)	// skip entries not selected for accumulation
		{
			// matrix width (row stride of A)
			int cols = 6 * (mw_width + 1);
			
			// calc position for insert: frame ids relative to window start `st`.
			// The last two window slots are remapped onto earlier ones —
			// presumably to match a host-side column reordering; TODO confirm
			// against the code that builds/solves A.
			int idx0 = (src[idx].fid0 - st);
			if (idx0 == mw_width - 2)
			{
				idx0 = idx0 - 1;
			}
			else if (idx0 == mw_width - 1)
			{
				idx0 = idx0 - 2;
			}

			int idx1 = (src[idx].fid1 - st);
			if (idx1 == mw_width - 2)
			{
				idx1 = idx1 - 1;
			}
			else if (idx1 == mw_width - 1)
			{
				idx1 = idx1 - 2;
			}
			// Top-left corner (row == col offset) of each 6-wide block slot.
			int pos0 = idx0 * 6;
			int pos1 = idx1 * 6;
			
			// set temporary matrices: w * J0*J0^T, w * J1*J1^T, w * J0*J1^T
			// and the weighted rhs contributions. Arrays this large (3 x 4.5 KB
			// of doubles per thread) cannot live in registers, so they spill to
			// local memory — the main cost of this version.
			double A00[24 * 24];
			double A11[24 * 24];
			double A01[24 * 24];
			double b0[24];
			double b1[24];
			for (int y = 0; y < 24; y++)
			{
				for (int x = 0; x < 24; x++)
				{
					A00[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J0[x];
					A11[y * 24 + x] = src[idx].w * src[idx].J1[y] * src[idx].J1[x];
					A01[y * 24 + x] = src[idx].w * src[idx].J0[y] * src[idx].J1[x];
				}
				b0[y] = src[idx].w * src[idx].c * src[idx].J0[y];
				b1[y] = src[idx].w * src[idx].c * src[idx].J1[y];
			}

			// set final matrix: scatter the four blocks (10 is the transpose
			// of 01) and the two rhs segments with atomics, since many entries
			// may target the same region of A/b concurrently.
			for (int i = 0; i < 24; i++)
			{
				for (int j = 0; j < 24; j++)
				{
					atomicAdd(&A[(i + pos0) * cols + (j + pos0)], A00[i * 24 + j]); // 00
					atomicAdd(&A[(i + pos1) * cols + (j + pos1)], A11[i * 24 + j]); // 11
					atomicAdd(&A[(i + pos0) * cols + (j + pos1)], A01[i * 24 + j]); // 01
					atomicAdd(&A[(i + pos1) * cols + (j + pos0)], A01[j * 24 + i]); // 10
				}
				atomicAdd(&b[i + pos0], b0[i]); // 0
				atomicAdd(&b[i + pos1], b1[i]); // 1
			}
		}
	}
}

Block-wise atomic adds into shared memory could speed things up, followed by a final per-block atomic add to accumulate the result into global memory.

If you can fit all per thread results into shared memory, consider using a parallel reduction to perform the summation within the block. It may be slightly faster than atomics.

And you’re using double-precision arithmetic on a device that is severely performance-limited in 64-bit precision. Is double precision an absolute necessity, or can you also work with 32-bit floating point?

Also, I am puzzled that you didn’t unroll some of the loops over the matrix elements. Your temp matrices end up getting stored in local memory instead of registers because of this. If those loops were unrolled, the values could be kept in faster registers. However, those 24 × 24 matrices would require a lot of registers (2 registers per double, so 2 × 24 × 24 = 1152 registers per matrix), so keeping all of them in registers will not work. Could A00, A11, A01 be computed one after another in separate loops, each added to A before the next is computed, so they do not all have to be stored at the same time?

As a final idea, consider splitting up the work that you currently do in a single thread over multiple threads. e.g. let 24 consecutive threads process one column or row. This will bring down register usage per thread.

Christian

I modified the code so that it does not use the temporary matrices.
The result was a performance increase from 250 ms to 130 ms.
I will try the rest of the methods if I can understand them. thank you!!

// One thread per cost term: scatters the weighted Jacobian outer products of
// a single residual into the system matrix A (row-major, row stride
// cols = 6*(mw_width+1)) and the rhs vector b. Entry positions come
// precomputed in src[i].pos0/pos1; entries with mask != 1 are skipped.
// Concurrent writes to overlapping blocks are combined with atomicAdd
// (double atomicAdd requires SM60+).
// Launch: 1D grid with at least `num` threads.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid >= num)
		return;

	const CostJacobianCT &e = src[tid];
	if (e.mask != 1)
		return;

	const int cols = 6 * (mw_width + 1);	// row stride of A
	const int p0 = e.pos0;
	const int p1 = e.pos1;
	const double w = e.w;
	const double c = e.c;

	// Stage both 24-element Jacobians in per-thread storage.
	double J0[24];
	double J1[24];
	for (int k = 0; k < 24; k++)
	{
		J0[k] = e.J0[k];
		J1[k] = e.J1[k];
	}

	// Accumulate the four 24x24 blocks and the two rhs segments.
	// w*J[i] is hoisted per row; the multiplication order matches the
	// original left-to-right evaluation, so results are bit-identical.
	for (int i = 0; i < 24; i++)
	{
		const double w0 = w * J0[i];
		const double w1 = w * J1[i];
		for (int j = 0; j < 24; j++)
		{
			atomicAdd(&A[(i + p0) * cols + (j + p0)], w0 * J0[j]);         // 00
			atomicAdd(&A[(i + p1) * cols + (j + p1)], w1 * J1[j]);         // 11
			atomicAdd(&A[(i + p0) * cols + (j + p1)], w0 * J1[j]);         // 01
			atomicAdd(&A[(i + p1) * cols + (j + p0)], w * J0[j] * J1[i]);  // 10
		}
		atomicAdd(&b[i + p0], w * c * J0[i]); // rhs segment 0
		atomicAdd(&b[i + p1], w * c * J1[i]); // rhs segment 1
	}
}

Loop-unrolled version — the per-element loops are now spread across threads (576 threads per source entry).
130 ms -> 99 ms.
How do I use shared memory?

// Work-split variant: 576 (= 24*24) consecutive threads cooperate on one src
// entry, each thread owning a single (row, col) element of the four 24x24
// blocks scattered into A; the thread with col == 0 also contributes the two
// rhs elements of its row. Concurrent writes are combined with atomicAdd
// (double atomicAdd requires SM60+).
// NOTE(review): with this mapping `num` must be the total work-item count
// (#entries * 576), not the entry count — confirm the host launch passes it
// that way.
__global__ void buildMatrixKernel(const CostJacobianCT *src, const int num, const int st, const int mw_width, double *A, double *b)
{
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid >= num)
		return;

	const int entry = tid / 576;	// which cost term this thread serves
	const CostJacobianCT &e = src[entry];
	if (e.mask != 1)
		return;

	const int cols = 6 * (mw_width + 1);	// row stride of A
	const int lane = tid % 576;		// position within the entry's 576 threads
	const int row = lane / 24;		// i: row inside the 24x24 block
	const int col = lane % 24;		// j: column inside the 24x24 block

	const double w = e.w;
	const double J0r = e.J0[row];
	const double J0c = e.J0[col];
	const double J1r = e.J1[row];
	const double J1c = e.J1[col];

	atomicAdd(&A[(row + e.pos0) * cols + (col + e.pos0)], w * J0r * J0c); // 00
	atomicAdd(&A[(row + e.pos1) * cols + (col + e.pos1)], w * J1r * J1c); // 11
	atomicAdd(&A[(row + e.pos0) * cols + (col + e.pos1)], w * J0r * J1c); // 01
	atomicAdd(&A[(row + e.pos1) * cols + (col + e.pos0)], w * J1r * J0c); // 10

	// Exactly one thread per row adds the rhs contributions.
	if (col == 0)
	{
		atomicAdd(&b[row + e.pos0], w * e.c * J0r); // 0
		atomicAdd(&b[row + e.pos1], w * e.c * J1r); // 1
	}
}