CUDA 9 slower than CUDA 8

With CUDA 8 this code gives ~48 fps (KernelSlow) and ~68 fps (KernelFast) on a GTX 1050Ti [N = 15K]. With CUDA 9 I get only
~25 and ~22 fps. On my GTX 1080 the problem is similar.

After compilation, the .exe file is 1323 KB with CUDA 9 and ~330 KB with CUDA 8.
All settings are at their defaults.

//CUDA_Force_Calc
__global__ void KernelSlow(float *POSM, float *POSX, float *POSY, float *POSZ, float *POSR, const int N)
{
	float AX_M, AY_M, AZ_M, det_X, det_Y, det_Z, den;
	int first = threadIdx.x + blockIdx.x * blockDim.x;
	AX_M = 0;
	AY_M = 0;
	AZ_M = 0;
	for (int next = 0; next < N; next++) {
		if ((first != next) && ((POSX[next] != POSX[first]) || (POSY[next] != POSY[first]) || (POSZ[next] != POSZ[first]))) {
			det_X = POSX[next] - POSX[first];
			det_Y = POSY[next] - POSY[first];
			det_Z = POSZ[next] - POSZ[first];
			den = POSM[next] / (30000 * pow((det_X*det_X + det_Y*det_Y + det_Z*det_Z), 3 / 2));
			AX_M += det_X * den;
			AY_M += det_Y * den;
			AZ_M += det_Z * den;
		}
	}
	POSR[first] += AX_M;
	POSR[first + N] += AY_M;
	POSR[first + N + N] += AZ_M;

	POSX[first] += POSR[first];
	POSY[first] += POSR[first + N];
	POSZ[first] += POSR[first + N + N];
}

//CUDA_Tile_Calc
__global__ void KernelFast(float *POSM, float *POSX, float *POSY, float *POSZ, float *POSR, const int N)
{
	__shared__ float LPOSX[blocksize], LPOSY[blocksize], LPOSZ[blocksize], LPOSM[blocksize];

	float AX_M = 0, AY_M = 0, AZ_M = 0, den;
	int ia = blockDim.x * blockIdx.x + threadIdx.x;
	float POSX0 = POSX[ia], POSY0 = POSY[ia], POSZ0 = POSZ[ia], POSM0 = POSM[ia];

	for (unsigned int ib = 0; ib < N; ib += blocksize) {

		LPOSX[threadIdx.x] = POSX[ib + threadIdx.x];
		LPOSY[threadIdx.x] = POSY[ib + threadIdx.x];
		LPOSZ[threadIdx.x] = POSZ[ib + threadIdx.x];
		LPOSM[threadIdx.x] = POSM[ib + threadIdx.x];
		__syncthreads();

#pragma unroll
		for (unsigned int ic = 0; ic < blocksize; ic++) {
			if ((ic != ia) && ((LPOSX[ic] != POSX0) || (LPOSY[ic] != POSY0) || (LPOSZ[ic] != POSZ0))) {
				float det_X = (LPOSX[ic] - POSX0);
				float det_Y = (LPOSY[ic] - POSY0);
				float det_Z = (LPOSZ[ic] - POSZ0);
				den = LPOSM[ic] / (30000 * pow((det_X*det_X + det_Y*det_Y + det_Z*det_Z), 3 / 2));
				AX_M += det_X * den;
				AY_M += det_Y * den;
				AZ_M += det_Z * den;
			}
		}
		__syncthreads();
	}
	POSR[ia] += AX_M;
	POSR[ia + N] += AY_M;
	POSR[ia + N + N] += AZ_M;

	POSX[ia] += POSR[ia];
	POSY[ia] += POSR[ia + N];
	POSZ[ia] += POSR[ia + N + N];
}

FULL Source (VS 2015): NBody_CUDA.zip - Google Drive

Thanks.

Such large performance disparities are uncharacteristic of compiler bugs, though the possibility cannot be excluded. Double-check that the CUDA 8 and CUDA 9 builds use exactly the same compiler options (in particular, debug versus release!). You can log the details of the compilation steps MSVS kicks off under the hood.

Make sure no environment variables impacting performance (such as CUDA_LAUNCH_BLOCKING) are set. Make sure no other applications using the GPU are running.

If after due diligence checks and apples-to-apples comparison (single variable change: CUDA version, all other hardware and software remains the same) you confirm this performance regression, I would suggest filing a bug with NVIDIA, using the bug reporting form reachable from the registered developer website.

BTW, something does not look quite correct here:

pow((det_X*det_X + det_Y*det_Y + det_Z*det_Z), 3 / 2)

3/2 is an integer division, and the result is 1, and pow(x,1) = x. I assume what you wanted there is pow (x, 1.5f)? If so, note that pow() is a low-throughput function, so you would want to use x*sqrt(x) instead, and then possibly use -use_fast_math to speed up the square root. If you re-arrange the math, you could also get rid of the division on that line.
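
For illustration only, assuming the intended exponent really is 1.5f, the inner-loop lines of KernelSlow could be written along these lines (a sketch using the variable names from the kernel above):

			float r2 = det_X*det_X + det_Y*det_Y + det_Z*det_Z;  // squared distance
			den = POSM[next] / (30000.0f * r2 * sqrtf(r2));      // m / (30000 * r^3), avoids pow()

With -use_fast_math the remaining division and the sqrtf() then map to fast approximate hardware instructions.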

The pointer arguments to these kernels should probably be restricted pointers for best performance. By adding the __restrict__ modifier you tell the compiler that the pointer arguments are not aliased, which allows it to optimize more aggressively, in particular giving it more freedom to re-order loads and stores.

Use the CUDA profiler to find performance bottlenecks: the global memory access patterns don’t look optimal at first glance, and the efficiency of global memory access may be low.
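
For example, something like the following would report the global load/store efficiency per kernel (the metric names are standard nvprof metrics; NBody_CUDA.exe is just an assumed executable name, substitute your own):

nvprof --metrics gld_efficiency,gst_efficiency NBody_CUDA.exe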

Thanks, after “-use_fast_math” CUDA 9 performance == CUDA 8: ~55 and ~100 fps on GTX 1050Ti.

Note that -use_fast_math converts pow(float) into a very fast, but also very approximate device intrinsic. The suggested replacement x*sqrt(x) should be equally fast with -use_fast_math, but is much more accurate than pow(x, 1.5f).

Code now:

//CUDA_Force_Calc
__global__ void KernelSlow(float *POSM, float *POSX, float *POSY, float *POSZ, float *POSR, const int N, const int grav, const float EPS)
{
	float AX_M, AY_M, AZ_M, det_X, det_Y, det_Z, den, norm, ext;
	int first = threadIdx.x + blockIdx.x * blockDim.x;
	AX_M = 0;
	AY_M = 0;
	AZ_M = 0;
	for (int next = 0; next < N; next++) {
			det_X = POSX[next] - POSX[first];
			det_Y = POSY[next] - POSY[first];
			det_Z = POSZ[next] - POSZ[first];
			norm = det_X*det_X + det_Y*det_Y + det_Z*det_Z;
			ext = norm * norm + EPS * EPS;
			den = grav * POSM[next] / (sqrtf(ext) * ext);
			AX_M += det_X * den;
			AY_M += det_Y * den;
			AZ_M += det_Z * den;		
	}
	POSR[first] += AX_M;
	POSR[first + N] += AY_M;
	POSR[first + N + N] += AZ_M;

	POSX[first] += POSR[first];
	POSY[first] += POSR[first + N];
	POSZ[first] += POSR[first + N + N];
}

//CUDA_Tile_Calc
__global__ void KernelFast(float *POSM, float *POSX, float *POSY, float *POSZ, float *POSR, const int N, const int grav, const float EPS)
{
	__shared__ float LPOSX[blocksize], LPOSY[blocksize], LPOSZ[blocksize], LPOSM[blocksize];

	float AX_M = 0, AY_M = 0, AZ_M = 0, den, norm, ext;
	int ia = blockDim.x * blockIdx.x + threadIdx.x;
	float POSX0 = POSX[ia], POSY0 = POSY[ia], POSZ0 = POSZ[ia], POSM0 = POSM[ia];

	for (unsigned int ib = 0; ib < N; ib += blocksize) {

		LPOSX[threadIdx.x] = POSX[ib + threadIdx.x];
		LPOSY[threadIdx.x] = POSY[ib + threadIdx.x];
		LPOSZ[threadIdx.x] = POSZ[ib + threadIdx.x];
		LPOSM[threadIdx.x] = POSM[ib + threadIdx.x];
		__syncthreads();

#pragma unroll
		for (unsigned int ic = 0; ic < blocksize; ic++) {			
			float det_X = (LPOSX[ic] - POSX0);
			float det_Y = (LPOSY[ic] - POSY0);
			float det_Z = (LPOSZ[ic] - POSZ0);
			norm = det_X*det_X + det_Y*det_Y + det_Z*det_Z;
			ext = norm * norm + EPS * EPS;
			den = grav * LPOSM[ic] / (sqrtf(ext) * ext);
			AX_M += det_X * den;
			AY_M += det_Y * den;
			AZ_M += det_Z * den;			
		}
		__syncthreads();
	}
	POSR[ia] += AX_M;
	POSR[ia + N] += AY_M;
	POSR[ia + N + N] += AZ_M;

	POSX[ia] += POSR[ia];
	POSY[ia] += POSR[ia + N];
	POSZ[ia] += POSR[ia + N + N];
}

GTX 1050Ti (Force) ~ 105 fps; (Tile) ~ 192 fps; i5-3450 (OpenMP) ~ 2 fps.

Thanks again.

You may want to try whether restricted pointers can improve the load/store handling:

__global__ void KernelSlow(float * __restrict__ POSM, float * __restrict__ POSX, float * __restrict__ POSY, float * __restrict__ POSZ, float * __restrict__ POSR, const int N, const int grav, const float EPS)

Usually it is better to use rsqrt() instead of sqrt() where applicable. It is not clear whether that would buy us something here, but it seems worth a try:

den = grav * POSM[next] * rsqrtf (ext) * (1.0f / ext);

restrict and rsqrtf() work, but without any change in performance.