High cost writing the global memory

Hi, I have the following kernel.

// Computes, for every subsequence length sLen in [minSLen, maxSLen] with step
// sLenStep, the nearest-neighbour correlation of each sLen-length subsequence
// of series blockIdx.x (a row of tss) against all windows of every training
// series j, and writes the derived distance sqrt(2*(1-bestCorr)) to
// nnDistsComplete. Correlations are z-normalized via per-window mean/stddev;
// the inner k loop slides ts2's window and updates sums and dot products
// incrementally (diagonal update), carrying dot products between threads
// through shared memory as 3-int decompositions (decomposeDouble, recomposed
// as elm_0 + elm_1/1e8 + elm_2/1e16 below).
//
// Launch expectations (inferred from the indexing — TODO confirm with caller):
//   - gridDim.x = number of pattern series; blockIdx.x selects a row of tss.
//   - dynamic shared memory must cover the larger of the two views declared
//     below; the ts1/ts2/buf/buf_0 view needs 2*tsLen floats + 6*tsLen ints.
//   - tsLen <= 2*blockDim.x: the *_0/*_1 register pairs (mu1_0/mu1_1, ...)
//     provide only two per-thread slots, i.e. numIters <= 2. Larger tsLen
//     silently reuses slot 1 — NOTE(review): verify this precondition.
__global__ void gCalcNNDists(float *tss, double *nnDistsComplete, int *numShAll,
	const int numTrain, const int numPLabeled, const int tsLen, const int numSLens,
	const int minSLen, const int maxSLen, const int sLenStep){

	int tid = threadIdx.x;
	int blockId = blockIdx.x;
	int blockSize = blockDim.x;

	// First view of the dynamic shared-memory buffer.
	extern __shared__ float array[];
	int *sharedSeed = (int *)array;
	int *sharedNumShAll = (int *)&sharedSeed[numPLabeled];
	float *sharedDistMem = (float *)&sharedNumShAll[numSLens];

	// Second view of the SAME buffer: ts1/ts2 cache the two series being
	// compared; buf/buf_0 hold per-position dot products, three ints each.
	// NOTE(review): the two views alias — filling ts1 below overwrites
	// sharedNumShAll once tsLen floats overrun the sharedSeed region, so the
	// sharedNumShAll[cnt++] read in the sLen loop may return clobbered data
	// after the first iteration. Currently harmless only because that value
	// is immediately overwritten (dead store, flagged below).
	float *ts1 = (float*)array;
	float *ts2 = (float*)&ts1[tsLen];
	int *buf = (int*)&ts2[tsLen];
	int *buf_0 = (int*)&buf[tsLen * 3];

	// Stage numShAll into shared memory, blockSize elements per pass.
	int i, j, k, start, numIters = ceil((double)numSLens / blockSize);
	for (i = 0; i < numIters; i++){
		start = i * blockSize + tid;
		if (start < numSLens)
			sharedNumShAll[start] = numShAll[start];
	}
	__syncthreads();

	double nnDist, term1, term2, s1, s1_2, mu1, sigma1, mu1_0, mu1_1, sigma1_0, sigma1_1, s2, s2_2, mu2, sigma2, corr, dotPr, dotPr_0, dotPr_1, nnCorr, nnCorr_0, nnCorr_1;
	int sLen, numSh, numSh_0, shId, idx, cnt, nextSh, idx_1, idx_2, numCharPerSh, elm_0, elm_1, elm_2;
	// NOTE(review): numCharPerSh, numSh_0, idx_1, idx_2, sharedSeed and
	// sharedDistMem are never used again in this kernel (dead leftovers?).
	numCharPerSh = ceil((double)numTrain / (BITS_PER_BYTE * sizeof(char)));
	numSh_0 = sharedNumShAll[0];
	cnt = nextSh = 0;
	for (sLen = minSLen; sLen <= maxSLen; sLen += sLenStep){
		// NOTE(review): dead store — numSh is unconditionally recomputed
		// from gridDim.x a few lines below.
		numSh = sharedNumShAll[cnt++];

		// Cache this block's own series (row blockId of tss) into ts1.
		numIters = ceil((double)tsLen / blockSize);
		for (i = 0; i < numIters; i++){
			start = i * blockSize + tid;
			if (start < tsLen)
				ts1[start] = tss[blockId * tsLen + start];
		}
		__syncthreads();

		// Total number of length-sLen subsequences across all blocks.
		numSh = gridDim.x * (tsLen - sLen + 1);
		// Per thread: mean/stddev of the ts1 window starting at `start`,
		// kept in registers (slot 0 for pass i==0, slot 1 for the second
		// pass). sigma falls back to 1 for numerically constant windows to
		// avoid dividing by zero later.
		for (i = 0; i < numIters; i++){
			start = i * blockSize + tid;
			if (start < tsLen - sLen + 1){
				s1 = s1_2 = 0;
				for (k = 0; k < sLen; k++){
					term1 = ts1[start + k];
					s1 += term1;
					s1_2 += term1 * term1;
				}
				mu1 = s1 / sLen;
				sigma1 = s1_2 / sLen > mu1 * mu1 ? sqrt(s1_2 / sLen - mu1 * mu1) : 1;
				if (i == 0){
					mu1_0 = mu1;
					sigma1_0 = sigma1;
				}
				else{
					mu1_1 = mu1;
					sigma1_1 = sigma1;
				}
			}
		}

		// Compare against every training series.
		for (j = 0; j < numTrain; j++){

			// Cache training series j into ts2.
			for (i = 0; i < numIters; i++){
				start = i * blockSize + tid;
				if (start < tsLen)
					ts2[start] = tss[j * tsLen + start];
			}
			__syncthreads();

			// Seed the diagonal (k == 0) state:
			//   buf_0[start] <- dot(ts1 window at 0,     ts2 window at start)
			//   buf[start]   <- dot(ts1 window at start, ts2 window at 0)
			// s2/s2_2/mu2/sigma2 are the stats of ts2's window at offset 0
			// (identical across threads; updated incrementally in the k loop
			// below). nnCorr_0/nnCorr_1 start as the correlation of each ts1
			// window against ts2's window at offset 0.
			for (i = 0; i < numIters; i++){
				start = i * blockSize + tid;
				if (start < tsLen - sLen + 1){
					s2 = s2_2 = 0;
					for (k = 0; k < sLen; k++){
						term2 = ts2[k];
						s2 += term2;
						s2_2 += term2 * term2;
					}
					mu2 = s2 / sLen;
					sigma2 = s2_2 / sLen > mu2 * mu2 ? sqrt(s2_2 / sLen - mu2 * mu2) : 1;

					dotPr = 0;
					for (k = 0; k < sLen; k++){
						term1 = ts1[k];
						term2 = ts2[start + k];
						dotPr += term1 * term2;
					}
					// Store the double dot product as 3 ints so it can be
					// exchanged between threads via the int-typed buffers.
					decomposeDouble(dotPr, elm_0, elm_1, elm_2);
					buf_0[3 * start] = elm_0;
					buf_0[3 * start + 1] = elm_1;
					buf_0[3 * start + 2] = elm_2;

					dotPr = 0;
					for (k = 0; k < sLen; k++){
						term1 = ts1[start + k];
						term2 = ts2[k];
						dotPr += term1 * term2;
					}
					decomposeDouble(dotPr, elm_0, elm_1, elm_2);
					buf[3 * start] = elm_0;
					buf[3 * start + 1] = elm_1;
					buf[3 * start + 2] = elm_2;

					if(i == 0)
						nnCorr_0 = (dotPr - sLen * mu1_0 * mu2) / (sLen * sigma1_0 * sigma2);
					else
						nnCorr_1 = (dotPr - sLen * mu1_1 * mu2) / (sLen * sigma1_1 * sigma2);
				}
			}
			__syncthreads();

			// Slide ts2's window from offset 1 to tsLen-sLen. The loop
			// bounds are uniform across the block, so the __syncthreads()
			// calls inside are reached by every thread.
			for (k = 1; k < tsLen - sLen + 1; k++){
				// O(1) update of ts2-window sums: add the entering sample,
				// drop the leaving one.
				term1 = ts2[k + sLen - 1];
				term2 = ts2[k - 1];
				s2 = s2 + term1 - term2;
				s2_2 = s2_2 + term1 * term1 - term2 * term2;
				mu2 = s2 / sLen;
				sigma2 = s2_2 / sLen > mu2 * mu2 ? sqrt(s2_2 / sLen - mu2 * mu2) : 1;
				// New dot product for (ts1@start, ts2@k): thread start==0
				// takes the precomputed value from buf_0; all others update
				// their left neighbour's previous diagonal value from buf.
				for (i = 0; i < numIters; i++){
					start = i * blockSize + tid;
					if (start < tsLen - sLen + 1){
						if (!start){
							elm_0 = buf_0[3 * k];
							elm_1 = buf_0[3 * k + 1];
							elm_2 = buf_0[3 * k + 2];
							// Recompose the double from its 3-int encoding.
							dotPr = elm_0 + (double)elm_1 / 1e8 + (double)elm_2 / 1e16;
						}
						else{
							elm_0 = buf[3 * (start - 1)];
							elm_1 = buf[3 * (start - 1) + 1];
							elm_2 = buf[3 * (start - 1) + 2];
							term1 = ts1[start + sLen - 1];
							term2 = ts2[k + sLen - 1];
							dotPr = elm_0 + (double)elm_1 / 1e8 + (double)elm_2 / 1e16 + term1 * term2;
							term1 = ts1[start - 1];
							term2 = ts2[k - 1];
							dotPr -= term1 * term2;
						}
						if (i == 0)
							dotPr_0 = dotPr;
						else
							dotPr_1 = dotPr;
					}
					// Ensure all reads of buf complete before it is
					// overwritten in the loop below.
					__syncthreads();
				}

				// Publish the new diagonal values into buf and fold the
				// resulting correlation into the per-thread best.
				for (i = 0; i < numIters; i++){
					start = i * blockSize + tid;
					if (start < tsLen - sLen + 1){
						if (i == 0)
							dotPr = dotPr_0;
						else
							dotPr = dotPr_1;
						decomposeDouble(dotPr, elm_0, elm_1, elm_2);
						buf[3 * start] = elm_0;
						buf[3 * start + 1] = elm_1;
						buf[3 * start + 2] = elm_2;
					}
					__syncthreads();

					if (i == 0)
						nnCorr = nnCorr_0;
					else
						nnCorr = nnCorr_1;

					if (start < tsLen - sLen + 1){
						if (i == 0){
							mu1 = mu1_0;
							sigma1 = sigma1_0;
						}
						else{
							mu1 = mu1_1;
							sigma1 = sigma1_1;
						}
						// Pearson correlation of the two z-normalized windows.
						corr = (dotPr - sLen * mu1 * mu2) / (sLen * sigma1 * sigma2);
						if (corr > nnCorr)
							nnCorr = corr;
					}
					if (i == 0)
						nnCorr_0 = nnCorr;
					else
						nnCorr_1 = nnCorr;
				}
			}

			// Convert the best correlation into a distance and write it out
			// — the kernel's only global-memory store (the write discussed
			// in the question below). corr > 1 can arise from floating-point
			// error; blockId == j is the self-match; both are forced to 0.
			for (i = 0; i < numIters; i++){
				start = i * blockSize + tid;
				if (i == 0)
					nnCorr = nnCorr_0;
				else
					nnCorr = nnCorr_1;
				if (start < tsLen - sLen + 1){
					shId = blockId * (tsLen - sLen + 1) + start;
					idx = j * numSh + shId;
					if (nnCorr > 1 || blockId == j)
						nnDist = 0;
					else
						nnDist = sqrt(2 * (1 - nnCorr));
					nnDistsComplete[nextSh * numTrain + idx] = nnDist;
				}
			}
		}
		__syncthreads();

		// Advance the output base offset past this sLen's shapelets.
		nextSh += numSh;
	}
}

I ran this kernel on a card with compute capability 5.0. I noticed that adding line 204, namely writing back to the global memory, seemed to significantly affect the running time. In one test case, running the kernel without line 204 took some 2 seconds. Adding it increased the running time to around 25 seconds. I checked the bandwidth and it turned out to be extremely low. I wonder if there is a way to improve the efficiency?

Apart from this, is there any other improvement I can implement to further boost the efficiency?

Thank you!

It’s often a misleading approach to do performance analysis simply by deleting lines of code, especially those that modify global state based on the results of kernel calculations.

This indicates a lack of understanding of how an aggressive optimizing compiler works. The compiler will detect that without your line of code, any calculations that fed that line of code (and nothing else) can be deleted from your kernel code during the compilation process.

So the reason that the execution time shrank from 25s to 2s is not that writing of global memory takes about 90% of the time of a CUDA kernel, but that the deletion of that line allowed the compiler to remove nearly all the code in your kernel. So that doesn’t really tell you anything useful.

Unless used very carefully, this is an unreliable technique. The reliable techniques (including analysis-driven optimization) for performance analysis and optimization are covered in a number of treatments online. If you google, for example:

cuda gtc optimization

you will find many quality tutorials on how to analyze and optimize CUDA code.