Wrong results with CUDA 3.1

Hello everyone,

I recently installed the new 3.1 driver and the CUDA 3.1 toolkit. My code (a CFD algorithm) used to compile and give correct answers with 3.0. With CUDA 3.1 the code still compiles without any problems, but it produces wrong results. I was able to run the SDK examples on 3.1 without issues.

While trying to investigate where the problem was coming from, I noticed that by manually unrolling a “for” loop inside a kernel, the code would produce correct results. Here are the two variants of the relevant portion of the code, one giving the correct answer, the other not. Unless I'm missing something really obvious, these two variants should do the exact same thing, but they give different results. The rest of the code is EXACTLY the same.

The code that doesn’t give the correct results with CUDA 3.1 but works with CUDA 3.0:

for (int j = 0; j < 4; j++)
{
    if (j == 0)
    {
        fl_temp = norm_x*(rho_l*vx_l) + norm_y*(rho_l*vy_l);
        fr_temp = norm_x*(rho_r*vx_r) + norm_y*(rho_r*vy_r);
        ql_temp = rho_l;
        qr_temp = rho_r;
    }
    else if (j == 1)
    {
        fl_temp = norm_x*(p_l + (rho_l*vx_l*vx_l)) + norm_y*(rho_l*vy_l*vx_l);
        fr_temp = norm_x*(p_r + (rho_r*vx_r*vx_r)) + norm_y*(rho_r*vy_r*vx_r);
        ql_temp = rho_l*vx_l;
        qr_temp = rho_r*vx_r;
    }
    else if (j == 2)
    {
        fl_temp = norm_x*(rho_l*vx_l*vy_l) + norm_y*(p_l + (rho_l*vy_l*vy_l));
        fr_temp = norm_x*(rho_r*vx_r*vy_r) + norm_y*(p_r + (rho_r*vy_r*vy_r));
        ql_temp = rho_l*vy_l;
        qr_temp = rho_r*vy_r;
    }
    else
    {
        fl_temp = norm_x*(vx_l*(energy_l + p_l)) + norm_y*(vy_l*(energy_l + p_l));
        fr_temp = norm_x*(vx_r*(energy_r + p_r)) + norm_y*(vy_r*(energy_r + p_r));
        ql_temp = energy_l;
        qr_temp = energy_r;
    }

    // Computing common normal flux
    fg_norm = 0.5*((fl_temp + fr_temp)
                 - (vn_av_mag + c_av)*(qr_temp - ql_temp));

    // Store transformed normal flux for left flux point
    d_norm_tfg_con_ed_fgpts[index_l] =
        fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_l];
    index_l += n_ed_fgpts_per_cell;

    // Store transformed normal flux for right flux point
    d_norm_tfg_con_ed_fgpts[index_r] =
        -fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_r];
    index_r += n_ed_fgpts_per_cell;
}

The code that works fine with CUDA 3.1 and 3.0:

// ------------------------------
// j = 0
// ------------------------------
fl_temp = norm_x*(rho_l*vx_l) + norm_y*(rho_l*vy_l);
fr_temp = norm_x*(rho_r*vx_r) + norm_y*(rho_r*vy_r);
ql_temp = rho_l;
qr_temp = rho_r;

// Computing common normal flux
fg_norm = 0.5*((fl_temp + fr_temp)
             - (vn_av_mag + c_av)*(qr_temp - ql_temp));

// Store transformed normal flux for left flux point
d_norm_tfg_con_ed_fgpts[index_l] =
    fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_l];
index_l += n_ed_fgpts_per_cell;

// Store transformed normal flux for right flux point
d_norm_tfg_con_ed_fgpts[index_r] =
    -fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_r];
index_r += n_ed_fgpts_per_cell;

// ------------------------------
// j = 1
// ------------------------------
fl_temp = norm_x*(p_l + (rho_l*vx_l*vx_l)) + norm_y*(rho_l*vy_l*vx_l);
fr_temp = norm_x*(p_r + (rho_r*vx_r*vx_r)) + norm_y*(rho_r*vy_r*vx_r);
ql_temp = rho_l*vx_l;
qr_temp = rho_r*vx_r;

// Computing common normal flux
fg_norm = 0.5*((fl_temp + fr_temp)
             - (vn_av_mag + c_av)*(qr_temp - ql_temp));

// Store transformed normal flux for left flux point
d_norm_tfg_con_ed_fgpts[index_l] =
    fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_l];
index_l += n_ed_fgpts_per_cell;

// Store transformed normal flux for right flux point
d_norm_tfg_con_ed_fgpts[index_r] =
    -fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_r];
index_r += n_ed_fgpts_per_cell;

// ------------------------------
// j = 2
// ------------------------------
fl_temp = norm_x*(rho_l*vx_l*vy_l) + norm_y*(p_l + (rho_l*vy_l*vy_l));
fr_temp = norm_x*(rho_r*vx_r*vy_r) + norm_y*(p_r + (rho_r*vy_r*vy_r));
ql_temp = rho_l*vy_l;
qr_temp = rho_r*vy_r;

// Computing common normal flux
fg_norm = 0.5*((fl_temp + fr_temp)
             - (vn_av_mag + c_av)*(qr_temp - ql_temp));

// Store transformed normal flux for left flux point
d_norm_tfg_con_ed_fgpts[index_l] =
    fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_l];
index_l += n_ed_fgpts_per_cell;

// Store transformed normal flux for right flux point
d_norm_tfg_con_ed_fgpts[index_r] =
    -fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_r];
index_r += n_ed_fgpts_per_cell;

// ------------------------------
// j = 3
// ------------------------------
fl_temp = norm_x*(vx_l*(energy_l + p_l)) + norm_y*(vy_l*(energy_l + p_l));
fr_temp = norm_x*(vx_r*(energy_r + p_r)) + norm_y*(vy_r*(energy_r + p_r));
ql_temp = energy_l;
qr_temp = energy_r;

// Computing common normal flux
fg_norm = 0.5*((fl_temp + fr_temp)
             - (vn_av_mag + c_av)*(qr_temp - ql_temp));

// Store transformed normal flux for left flux point
d_norm_tfg_con_ed_fgpts[index_l] =
    fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_l];

// Store transformed normal flux for right flux point
d_norm_tfg_con_ed_fgpts[index_r] =
    -fg_norm*d_mag_norm_dot_jac_inv_ed_fgpts[index2_r];

Am I missing something really obvious? Also, the kernel that works fine uses 68 registers, while the version that doesn’t work on 3.1 uses only 54 registers. I would really appreciate your help.
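(For what it's worth, I read the register counts from the ptxas output, i.e. compiling with --ptxas-options=-v. They can also be queried at runtime, roughly like this; my_kernel is just a placeholder for the actual kernel symbol:)

[codebox]// Rough sketch of a runtime register-count query.
// my_kernel is a placeholder for the actual kernel symbol.
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, my_kernel);
printf("my_kernel uses %d registers per thread\n", attr.numRegs);[/codebox]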

Hi,

Are you using the same machine for SDK 3.0 and SDK 3.1? (Are you using 3.1 with a Fermi card?)


Hi,

There appears to be something weird with your non-working code. You calculate the temporaries in four different ways (loop variable j) and then compute the same fluxes without using j, as far as I can see. Therefore you would normally end up with the fluxes calculated for j = 3. The compiler tries to optimize loops and conditions away, and perhaps the way this is done leads to different code under 3.1 than under 3.0.

The point is, I think, that you calculate all fluxes for each value of j.

Your working code does that job for the compiler, and moreover j there apparently also indicates which fluxes are to be calculated.

I would say, stick with the second code…

So in the end I am very surprised that the first code worked under CUDA 3.0.

Please tell me if I am right here, or whether I am missing something really obvious…
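To illustrate what I mean by “optimizing the conditions away”: after full unrolling, each j is a compile-time constant, so e.g. the j = 0 copy of the body should fold down to the branchless form (which is just your own manually unrolled code for that case):

[codebox]// What the unrolled j == 0 iteration should reduce to once the
// compiler folds the constant conditions:
fl_temp = norm_x*(rho_l*vx_l) + norm_y*(rho_l*vy_l);
fr_temp = norm_x*(rho_r*vx_r) + norm_y*(rho_r*vy_r);
ql_temp = rho_l;
qr_temp = rho_r;[/codebox]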


@pavanki: Yes, the same machine, with the latest driver (3.1).

@jan: I’m not sure I understand what you’re trying to say. The values of ql_temp, qr_temp, fl_temp and fr_temp depend on the value of j, so fg_norm will be different for each j (because it depends on ql_temp, qr_temp, etc.).
I wouldn’t mind sticking with the second code, but the increase in the number of registers bothers me. I could also store the values of ql_temp, qr_temp, fl_temp and fr_temp for each value of j, but that would likewise increase the register count. It seems like both versions of the code should do exactly the same thing, and that’s what happens with 3.0. Why wouldn’t it work with 3.1? A bug in the compiler?


@pavanki: Running on a Tesla C1060.


Sorry, I overlooked the increments of index_l and index_r. It seemed to me that the same values were calculated repeatedly, but I was wrong there. Your code should work. Maybe my hunch that the compiler unrolls the loop in a wrong manner or optimizes the conditions away is a start. I’ll try to figure it out.

But you can check by putting

#pragma unroll 1

in front of the loop to prevent unrolling (see Programming Guide, Appendix E.2).
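That is, directly in front of the loop:

[codebox]#pragma unroll 1   // 1 = keep the loop rolled, i.e. no unrolling
for (int j = 0; j < 4; j++)
{
    // ... loop body unchanged ...
}[/codebox]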


[quote name=‘pcasto’ post=‘1103952’ date=‘Aug 15 2010, 07:57 AM’]

While trying to investigate where the problem was coming from, I noticed that by manually unrolling a “for” loop inside a kernel, the code would produce correct results.

[/quote]

Here is a small test program that tries to reproduce the pattern (a loop whose body branches on the loop variable and stores through a strided index):

[codebox]#include "cutil_inline.h"

#define N 200
#define M 100
#define K 4            // if K != 4, adapt the loops in the kernel and do_loop()
#define SIZE (N*M*K)

int Values[M*N], Result1[SIZE], Result2[SIZE], Check1[SIZE], Check2[SIZE];

__global__ void loopkernel(int *result1, int *result2, int *values, int *devdebugs)
{
    int val, norm;
    int srcindex = blockIdx.x * blockDim.x;
    int dstindex = blockIdx.x * blockDim.x + threadIdx.x;
    int stride   = blockDim.x * gridDim.x;

    // devdebugs can be useful to inspect values when (host) debugging.
    // Too lazy to use Parallel Nsight right now. Trivial example:
    // if (dstindex == 0)
    // {
    //     devdebugs[0] = blockDim.x;
    //     devdebugs[1] = gridDim.x;
    // }

#pragma unroll 4  // testing several values here; 1 means no unrolling
    for (int j = 0; j < K; j++)
    {
        if (j == 0)
            val = values[srcindex + 3] + dstindex;
        else if (j == 1)
            val = values[srcindex + 2] + dstindex;
        else if (j == 2)
            val = values[srcindex + 1] + dstindex;
        else
            val = values[srcindex + 0] + dstindex;

        // Computing something
        norm = val*val;
        // Store the result in array 1
        result1[dstindex] = norm;
        // Store another result in array 2
        result2[dstindex] = norm * norm;
        dstindex += stride;
    }
}

// CPU reference: mirrors what one GPU thread (block, bthr) does in loopkernel
int do_loop(int *c1, int *c2, int *v, int block, int bthr)
{
    int maxindex = 0;
    int val, norm;
    int srcindex = block * N;
    int dstindex = block * N + bthr;
    int stride   = M*N;

    for (int j = 0; j < K; j++)
    {
        if (j == 0)
            val = v[srcindex + 3] + dstindex;
        else if (j == 1)
            val = v[srcindex + 2] + dstindex;
        else if (j == 2)
            val = v[srcindex + 1] + dstindex;
        else
            val = v[srcindex + 0] + dstindex;

        // Computing something
        norm = val*val;
        // Store the result in array 1
        c1[dstindex] = norm;
        maxindex = max(maxindex, dstindex);
        if (dstindex >= SIZE) puts("Out of bounds");
        // Store the result in array 2
        c2[dstindex] = norm*norm;
        dstindex += stride;
    }
    return maxindex;
}

int looptest_cpu(int *c1, int *c2, int *v)
{
    int maxindex = 0;
    for (int block = 0; block < M; block++)
        for (int bthr = 0; bthr < N; bthr++)
        {
            int n = do_loop(c1, c2, v, block, bthr);
            maxindex = max(maxindex, n);
        }
    return maxindex;
}

int check(int *a, int *b, int n)
{
    for (; --n >= 0; ) if (a[n] != b[n]) return 0;
    return 1;
}

int main()
{
    int *values, *result1, *result2;
    int *debugs, *devdebugs;
    cudaDeviceProp deviceProp;

    printf("Test for researching a possible bug in CUDA.\n"
           "The kernel uses a loop to calculate %d values\n"
           "and writes these out to a global array, the index\n"
           "of which is incremented in the loop.\n"
           "The results are checked.\n"
           "Arraysize is %d x %d\n"
           "Because the stride is %d, the result arrays contain %d entries.\n\n",
           K, M, N, M*N, SIZE);

    cudaSetDevice(0);
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, 0));
    printf("Use device %d: \"%s\"\n", 0, deviceProp.name);

    cutilSafeCall(cudaMalloc((void**)&values,  M*N*sizeof(int)));
    cutilSafeCall(cudaMalloc((void**)&result1, SIZE*sizeof(int)));
    cutilSafeCall(cudaMalloc((void**)&result2, SIZE*sizeof(int)));
    cutilSafeCall(cudaHostAlloc((void**)&debugs, 100*sizeof(int), cudaHostAllocMapped));
    cudaHostGetDevicePointer((void**)&devdebugs, debugs, 0);

    for (int n = 0; n < M*N; n++) Values[n] = n;

    puts("Copy array to GPU");
    cudaMemcpy(values, Values, M*N*sizeof(int), cudaMemcpyHostToDevice);

    puts("Run kernel on GPU");
    loopkernel<<<M,N>>>(result1, result2, values, devdebugs);
    cutilCheckMsg("Kernel execution failed");
    cudaThreadSynchronize();

    puts("Copy results to CPU");
    cudaMemcpy(Result1, result1, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(Result2, result2, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree((void*)values);
    cudaFree((void*)result1);
    cudaFree((void*)result2);
    cudaFreeHost(debugs);

    puts("Run calculation on CPU");
    int maxindex = looptest_cpu(Check1, Check2, Values);

    puts("Compare results");
    if (!check(Result1, Check1, SIZE))      puts("Check on Result1 failed");
    else if (!check(Result2, Check2, SIZE)) puts("Check on Result2 failed");
    else puts("Passed");

    if (maxindex != SIZE-1) printf("Maxindex %d of %d\n", maxindex, SIZE);
    return 0;
}[/codebox]
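If the loop is miscompiled, a run should print one of the “Check on Result1/Result2 failed” lines instead of “Passed”, and the final Maxindex line additionally flags any indexing problem.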


Thanks Jan, I will take a look at your code and try to replicate the problem. Maybe it’s a bug somewhere else in my code that only shows up when I use the for loop, although I can’t see what could be causing this.


I tried the #pragma unroll 1 option, but no luck.
