Low global load/storage efficiency - reason

Currently I write a small program from cuda - it renders electrical field potential. I have very low global load & store efficiency I do not know what to do. Nvvp profiler says I have coalesced access for both read/write I do not use local memory, only shared for computations, I do not see any reason. Can someone explain , what am I doing wrong?

__device__ void CudaSum(float *B, int threadsPerPixel, int bIdx, int cIdx,
		unsigned char * C) {
	const float scale = 1.0f;
	const float offset = 128.0f;
	register float force = 0.0f;
	register float val = 0.0f;

	register int l = 0;

	for (l = 0; l < threadsPerPixel; l++)
		force += B[bIdx + l];

	val = scale * force + offset;
	val = max(0.0f, min(255.0f, val));
	C[cIdx] = val;
}

__global__ void CudaFieldCalculate(float *A, int numPoints, int width,
		int height, unsigned char *C, int numElements, float *B,
		int pointsPerThread, int threadsPerPixel) {
	const float scaleX = 0.01f;
	const float scaleY = 0.01f;
	 __shared__ float3 temp[20];

	int pos = blockDim.x * blockIdx.x + threadIdx.x;
	int i = blockDim.y * blockIdx.y + threadIdx.y;
	int j = pos / threadsPerPixel;
	int k = pos - j*threadsPerPixel;


	int elem = i * width + j;
	int cIdx = elem * numElemsPerPoint;
	int bIdx = elem * threadsPerPixel;
	float3 *A1 = (float3 *) A;
	register float diffX, diffY;
	register float force = 0.0f;
	register float3 wars;


	if (i >= height || j >= width)
		return;


	if (threadIdx.x < numPoints)
		temp[threadIdx.x] = A1[threadIdx.x];

	__syncthreads();


	//if (k != 0)
	//	return;

	int limit = min(pointsPerThread, numPoints-k*pointsPerThread);



	for (register int m = 0; m < limit; m++) {
		//wars = temp[k*pointsPerThread+m];
		diffX = (j - temp[k*pointsPerThread+m].x) * scaleX;
		diffY = (i - temp[k*pointsPerThread+m].y) * scaleY;
		force += temp[k*pointsPerThread+m].z / sqrt(diffX * diffX + diffY * diffY);
	}

	return;

	//if (k != 0)
	//	return;

	// musimy przez pami?? dzielon?


	 B[bIdx+k] = force;

	__syncthreads();


	if (k == 0)
		CudaSum(B, threadsPerPixel, bIdx, cIdx, C);

}

Only kernel source

I don’t know how NVVP defines the efficiency, but it is not possible to utilize the full bandwidth of the GPU when each thread does only 8-bit or 16-bit transactions. It seems at least some of the memory accesses in your code are unsigned chars. You might want to look into having the code work with uchar4 instead, where each thread handles four bytes simultaneously.

Side remark: You may want to consider using rsqrt() instead of sqrt() in the field calculation.

I changed unsigned chars to uchar4 It helped a little, but I still have poor global load/store access pattern

__device__ void CudaSum(float *B, int threadsPerPixel, int bIdx, int cIdx,
		unsigned char * C) {
	const float scale = 1.0f;
	const float offset = 128.0f;
	register float force = 0.0f;
	register float val = 0.0f;

	register int l = 0;

	for (l = 0; l < threadsPerPixel; l++)
		force += B[bIdx + l];

	val = scale * force + offset;
	val = max(0.0f, min(255.0f, val));
	C[cIdx] = val;
}

__global__ void CudaFieldCalculate(float *A, int numPoints, int width,
	int height, unsigned char *C, int numElements, float *B,
	int pointsPerThread, int threadsPerPixel) {
	const float scaleX = 0.01f;
	const float scaleY = 0.01f;

	int pos = blockDim.x * blockIdx.x + threadIdx.x;
	int i = blockDim.y * blockIdx.y + threadIdx.y;
	int j = pos / threadsPerPixel;
	int k = pos - j*threadsPerPixel;

	int elem = i * width + j;
	int cIdx = elem * numElemsPerPoint;
	int bIdx = elem * threadsPerPixel;
	float3 *A1 = (float3 *) A;
	register float diffX, diffY;
	register float force = 0.0f;


	if (i >= height || j >= width)
		return;

	//if (threadIdx.x < numPoints)
	//	temp[threadIdx.x] = A1[threadIdx.x];

	// __syncthreads();


	int limit = min(pointsPerThread, numPoints-k*pointsPerThread);



	for (register int m = 0; m < limit; m++) {
		//wars = temp[k*pointsPerThread+m];
		diffX = (j - A1[k*pointsPerThread+m].x) * scaleX;
		diffY = (i - A1[k*pointsPerThread+m].y) * scaleY;
		force += A1[k*pointsPerThread+m].z / sqrt(diffX * diffX + diffY * diffY);
	}




	  B[bIdx+k] = force;

	__syncthreads();


	if (k == 0)
		CudaSum(B, threadsPerPixel, bIdx, cIdx, C);


}

Currently I have such solution - every thread is assigned to one pixel of image, maybe one thread could be assigned to many pixels? Could it help?

  1. matter

B is a array of floats. Floats’ size is 2 bytes - 16 bit? Should I align 2 bytes to 4 bytes - f.e. double every float and access only 1?

–EDIT-- This program should be capable to render 1024x1024 with 100k points ;p Currenyly handles 640x480 - 30 points with difficulties