Very strange behaviour. Maybe a bug...? Kernel fails to run strangely, but no errors are reported.

Carmelo · May 12, 2009, 10:18pm

----- Synopsis description of the problem -----

Kernel fails to run strangely.

----- Detailed description of the problem -----

Kernel fails to run:

return data are incorrect (typically zeros),

kernel execution time is almost zero,

but no errors are reported.

Kernel works incorrectly when

the following operation is performed:

[codebox]// UNCOMMENT THE FOLLOWING LINE TO "SOLVE" THE PROBLEM

//#define WORKAROUND_ON

#include <cuda_runtime.h>

#include <stdio.h>

#include <cutil_inline.h>

// Extracts the 2-bit field of xmask that starts at bit k, subtracts 1 and
// converts the result to float (so the value is one of -1, 0, 1, 2).
#define em2(xmask, k) ((float) ( (((xmask) >> (k)) & 3) - 1))

// Extracts bit number IND of PROG (result is 0 or 1).
#define BIT(PROG, IND) (((PROG) >> (IND) ) & 1)

// rand() scaled by RAND_MAX, as a double in [0, 1].
#define drand (((double) rand()) / RAND_MAX)

// Total number of threads / elements processed; assigned at the start of main().
long int NTHREADS;

// Host-side 64-entry table; main() zeroes it and copies it to the device.
unsigned char outputCouples2[64];

/*
 * Combines two float4 operands into two float4 outputs.
 *
 * Every output component is the sum of two products of input components,
 * each product scaled by a coefficient em2(ien2, k) decoded from a 2-bit
 * field of ien2.
 *
 *   vinp1, vinp2 : input operands (read once into locals).
 *   vout1, vout2 : output locations; vout2->w is folded into vout1->w and
 *                  then set to 0.
 *   ien2         : packed coefficient mask consumed by em2().
 *
 * When WORKAROUND_ON is defined, the two factors of the second term of
 * vout1->z are first stored in the locals bug1/bug2; the arithmetic is
 * otherwise the same.
 *
 * (The __device__ qualifier and the '*' operators on the vout1->x and
 * vout2->x lines had been stripped by the forum markup; they are restored
 * here following the pattern of the neighbouring lines.)
 */
__device__ void d_vprod(float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2, int ien2)
{
    float4 in1 = *vinp1, in2 = *vinp2;

#ifdef WORKAROUND_ON
    float bug1, bug2;
    bug1 = em2(ien2,28);
    bug2 = in1.w*in2.z;
#endif

    vout1->x = em2(ien2,6)*(in1.x)*(in2.w) + em2(ien2,24)*(in1.w)*(in2.x);
    vout1->y = em2(ien2,14)*(in1.y)*(in2.w) + em2(ien2,26)*(in1.w)*(in2.y);
#ifdef WORKAROUND_ON
    vout1->z = em2(ien2,22)*(in1.z)*(in2.w) + bug1*bug2;
#else
    vout1->z = em2(ien2,22)*(in1.z)*(in2.w) + em2(ien2,28)*in1.w*in2.z;
#endif
    vout1->w = em2(ien2,20)*(in1.z)*(in2.z) + em2(ien2,30)*(in1.w)*(in2.w); //scalar

    vout2->x = em2(ien2,12)*(in1.y)*(in2.z) + em2(ien2,18)*(in1.z)*(in2.y);
    vout2->y = em2(ien2,4)*(in1.x)*(in2.z) + em2(ien2,16)*(in1.z)*(in2.x);
    vout2->z = em2(ien2,2)*(in1.x)*(in2.y) + em2(ien2,8)*(in1.y)*(in2.x);
    vout2->w = em2(ien2,0)*(in1.x)*(in2.x) + em2(ien2,10)*(in1.y)*(in2.y); //scalar

    // Accumulate both scalar parts into vout1->w and clear vout2->w.
    vout1->w += vout2->w;
    vout2->w = 0;
}

/*
 * Kernel: each thread combines the single operand *vinp1 with its own
 * element vinp2[tid] via d_vprod() and records an output code.
 *
 *   code           : only the low 6 bits (code0) are used.
 *   vinp1          : pointer to one float4 shared by all threads.
 *   vinp2          : per-thread input array, indexed by tid.
 *   vout1, vout2   : per-thread output arrays, indexed by tid. Bit code0 of
 *                    the constant outswaps decides which of the two is
 *                    passed to d_vprod() as its first output.
 *   outputCouples2 : lookup table indexed by code0.
 *   outtype        : per-thread output, receives outputCouples2[code0].
 *   ien2           : forwarded unchanged to d_vprod().
 *
 * NOTE: tid is computed with a hard-coded block size of 512, so the kernel
 * must be launched with 512 threads per block for the indexing to be dense.
 * There is no bounds check on tid.
 *
 * (The __global__ qualifier had been stripped by the forum markup and is
 * restored here.)
 */
__global__ void d_gprod(unsigned short code, float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2,
                        unsigned char* outputCouples2, unsigned char* outtype, int ien2)
{
    const unsigned long long outswaps = 0x55aa55aa55aa55aaLL;
    unsigned short code0;
    float4 *voutS, *voutP;
    unsigned short swapon;

    int tid = blockIdx.x*512+threadIdx.x;

    code0 = code & 0x3f;
    swapon = (unsigned short) BIT(outswaps, code0);

    // Pick which output array plays which role for this code.
    if (swapon) {
        voutS = vout2+tid;
        voutP = vout1+tid;
    } else {
        voutS = vout1+tid;
        voutP = vout2+tid;
    }

    d_vprod(vinp1, vinp2+tid, voutS, voutP, ien2);

    *(outtype+tid) = outputCouples2[code0];
}

/*
 * Host driver: fills one float4 operand and NTHREADS float4 operands with
 * random values, runs d_gprod on the device with 512 threads per block,
 * copies the results back, prints the first three results and the two
 * measured times (kernel only, and kernel plus transfers).
 *
 * (Tokens that had been split across lines by the forum line wrapping --
 * "float4" and "unsigned" inside the device-to-host copies -- and the
 * typographic quotes in the final printf calls are repaired here.)
 */
int main(int argc, char** argv)
{
    // INIT
    NTHREADS = 512*512;
    for(int i=0; i<=63; i++) outputCouples2[i]=0;

    // HOST MEMORY ALLOCATION
    unsigned short codein;
    float4 qin1;
    float4* qin2 = (float4*) malloc(NTHREADS*sizeof(float4));
    float4* qout1 = (float4*) malloc(NTHREADS*sizeof(float4));
    float4* qout2 = (float4*) malloc(NTHREADS*sizeof(float4));
    unsigned char* codeout = (unsigned char*) malloc(NTHREADS*sizeof(unsigned char));

    // DEVICE MEMORY ALLOCATION
    float4* d_qin1; cutilSafeCall(cudaMalloc((void**)&d_qin1,sizeof(float4)));
    float4* d_qin2; cutilSafeCall(cudaMalloc((void**)&d_qin2,NTHREADS*sizeof(float4)));
    float4* d_qout1; cutilSafeCall(cudaMalloc((void**)&d_qout1,NTHREADS*sizeof(float4)));
    float4* d_qout2; cutilSafeCall(cudaMalloc((void**)&d_qout2,NTHREADS*sizeof(float4)));
    unsigned char* d_outputCouples2; cutilSafeCall(cudaMalloc((void**)&d_outputCouples2,64*sizeof(unsigned char)));
    unsigned char* d_codeout; cutilSafeCall(cudaMalloc((void**)&d_codeout,NTHREADS*sizeof(unsigned char)));

    // DATA TRANSFER TO DEVICE
    cutilSafeCall(cudaMemcpy(d_outputCouples2,outputCouples2,64*sizeof(unsigned char),cudaMemcpyHostToDevice));

    // TIMER INIT
    double gpu_kernel_time = 0, gpu_total_time = 0;
    unsigned int hTimer, hTimer2;

    // OPERANDS INIT
    qin1.x = drand; qin1.y = drand; qin1.z = drand; qin1.w = drand;
    for (int i=0; i<NTHREADS; i++)
    {
      qin2[i].x = drand; qin2[i].y=drand; qin2[i].z = drand; qin2[i].w = drand;
    }
    codein = 0;

    // TIMER START (KERNEL + DATA TRANSFER)
    cutilCheckError( cutCreateTimer(&hTimer2) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutResetTimer(hTimer2) );
    cutilCheckError( cutStartTimer(hTimer2) );

    // DATA TRANSFER TO DEVICE
    cutilSafeCall(cudaMemcpy(d_qin2,qin2,NTHREADS*sizeof(float4),cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_qin1,&qin1,sizeof(float4),cudaMemcpyHostToDevice));

    // TIMER START (KERNEL ONLY)
    cutilCheckError( cutCreateTimer(&hTimer) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    // COMPUTATION ON DEVICE (KERNEL EXECUTION)
    // Up to 512 threads per block; the grid covers NTHREADS/512 blocks.
    dim3 dimBlock(((NTHREADS<=512)?NTHREADS:512),1);
    dim3 dimGrid(((NTHREADS<=512)?1:NTHREADS/512),1);
    // NOTE: the launch itself is not checked here; a cudaGetLastError() call
    // right after the launch is what would report a failed launch.
    d_gprod<<<dimGrid,dimBlock>>>(codein, d_qin1, d_qin2, d_qout1, d_qout2, d_outputCouples2, d_codeout, 0);

    // TIMER END (KERNEL ONLY)
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer(hTimer) );
    gpu_kernel_time = cutGetTimerValue(hTimer);

    // DATA TRANSFER TO HOST
    cutilSafeCall(cudaMemcpy(qout1,d_qout1,NTHREADS*sizeof(float4),cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(qout2,d_qout2,NTHREADS*sizeof(float4),cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(codeout,d_codeout,NTHREADS*sizeof(unsigned char),cudaMemcpyDeviceToHost));

    // TIMER END (KERNEL + DATA TRANSFER)
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer(hTimer2) );
    gpu_total_time = cutGetTimerValue(hTimer2);

    // SHOW FIRST 3 RESULTS
    for(int i=0; i<=2; i++)
    {
      printf("--- Operation n. %d ---\n", i+1);
      printf("(%.2f, %.2f, %.2f, %.2f) GP ", qin1.x,  qin1.y,  qin1.z,  qin1.w);
      printf("(%.2f, %.2f, %.2f, %.2f) = ", qin2[i].x,  qin2[i].y,  qin2[i].z,  qin2[i].w);
      printf("(%.2f, %.2f, %.2f, %.2f) + ", qout1[i].x, qout1[i].y, qout1[i].z, qout1[i].w);
      printf("(%.2f, %.2f, %.2f, %.2f)\n", qout2[i].x, qout2[i].y, qout2[i].z, qout2[i].w);
      //printf("Output code: %d \n", codeout[i]);
    }

    // SHOW TOTAL TIME
    printf("\nNumber of threads: %li\n",NTHREADS);
    printf("GPU kernel time: %lf msec\n",gpu_kernel_time);
    printf("GPU total time: %lf msec\n",gpu_total_time);

    return 0;
}[/codebox]

tmurray · May 12, 2009, 10:33pm

Stop using cutil. It should not be used by anyone and is absolutely not a substitute for doing robust error checking yourself. It’s a convenience wrapper for sample code, nothing more. (note to self: hide TMURRAY_WILL_HUNT_YOU_DOWN in common.mk, wrap big #ifdefs in cutil.h so people can’t use it unless they actually look at it and therefore realize that it is not something they should be using)
Do timing with CUDA events instead of CPU side timers.
You’re not doing error checking correctly. Most likely you’re not actually launching the kernel. To check errors correctly in the runtime API, you actually have to do:

kernel<<<...>>>();

if (cudaGetLastError() != cudaSuccess)

  {

	printf("Kernel did not launch successfully\n");

	..

  }

cudaThreadSynchronize();

if (cudaGetLastError() != cudaSuccess)

  {

	printf("Kernel did not complete successfully\n");

	..

  }

Carmelo · May 13, 2009, 12:35pm

First of all, thank you for the prompt answer.

Stop using cutil.

Ok. Done.

Do timing with CUDA events instead of CPU side timers.

Ok. Done.

You’re not doing error checking correctly.

Most likely you’re not actually launching the kernel.

To check errors correctly in the runtime API, you actually have to do:

[…]

Ok. Done.

The problem persists.

The kernel fails to launch, reporting error 7:

“too many resources requested for launch”.

It doesn’t make sense,

because kernel launch fails, for example,

with only 449 threads on configuration -C-…

but it works correctly with over 4 million threads

with the workaround described above

where I use two more float variables.

Is there any explanation?

Complete updated code follows:

[codebox]

// UNCOMMENT THE FOLLOWING LINE TO "SOLVE" THE PROBLEM
//#define WORKAROUND_ON

#include <cuda_runtime.h>
#include <stdio.h>

// Extracts the 2-bit field of xmask that starts at bit k, subtracts 1 and
// converts the result to float (so the value is one of -1, 0, 1, 2).
#define em2(xmask, k) ((float) ( (((xmask) >> (k)) & 3) - 1))

// Extracts bit number IND of PROG (result is 0 or 1).
#define BIT(PROG, IND) (((PROG) >> (IND) ) & 1)

// rand() scaled by RAND_MAX, as a double in [0, 1].
#define drand (((double) rand()) / RAND_MAX)

// Wraps a runtime API call so that a failure is reported with the call site.
#define checkError(err) __checkError(err, __FILE__, __LINE__)

// Total number of threads / elements processed; assigned at the start of main().
long int NTHREADS;

// Host-side 64-entry table; main() zeroes it and copies it to the device.
unsigned char outputCouples2[64];

/*
 * Combines two float4 operands into two float4 outputs.
 *
 * Every output component is the sum of two products of input components,
 * each product scaled by a coefficient em2(ien2, k) decoded from a 2-bit
 * field of ien2.
 *
 *   vinp1, vinp2 : input operands (read once into locals).
 *   vout1, vout2 : output locations; vout2->w is folded into vout1->w and
 *                  then set to 0.
 *   ien2         : packed coefficient mask consumed by em2().
 *
 * When WORKAROUND_ON is defined, the two factors of the second term of
 * vout1->z are first stored in the locals bug1/bug2; the arithmetic is
 * otherwise the same.
 *
 * (The __device__ qualifier, the '#' of the preprocessor directives and the
 * '*' operators on the vout1->x and vout2->x lines had been stripped by the
 * forum markup; they are restored here following the neighbouring lines.)
 */
__device__ void d_vprod(float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2, int ien2)
{
    float4 in1 = *vinp1, in2 = *vinp2;

#ifdef WORKAROUND_ON
    float bug1, bug2;
    bug1 = em2(ien2,28);
    bug2 = in1.w*in2.z;
#endif

    vout1->x = em2(ien2,6) *in1.x*in2.w + em2(ien2,24)*in1.w*in2.x;
    vout1->y = em2(ien2,14)*in1.y*in2.w + em2(ien2,26)*in1.w*in2.y;
#ifdef WORKAROUND_ON
    vout1->z = em2(ien2,22)*in1.z*in2.w + bug1*bug2;
#else
    vout1->z = em2(ien2,22)*in1.z*in2.w + em2(ien2,28)*in1.w*in2.z;
#endif
    vout1->w = em2(ien2,20)*in1.z*in2.z + em2(ien2,30)*in1.w*in2.w;

    vout2->x = em2(ien2,12)*in1.y*in2.z + em2(ien2,18)*in1.z*in2.y;
    vout2->y = em2(ien2,4) *in1.x*in2.z + em2(ien2,16)*in1.z*in2.x;
    vout2->z = em2(ien2,2) *in1.x*in2.y + em2(ien2,8) *in1.y*in2.x;
    vout2->w = em2(ien2,0) *in1.x*in2.x + em2(ien2,10)*in1.y*in2.y;

    // Accumulate both scalar parts into vout1->w and clear vout2->w.
    vout1->w += vout2->w;
    vout2->w = 0;
}

/*
 * Kernel: each thread combines the single operand *vinp1 with its own
 * element vinp2[tid] via d_vprod() and records an output code.
 *
 *   code           : only the low 6 bits (code0) are used.
 *   vinp1          : pointer to one float4 shared by all threads.
 *   vinp2          : per-thread input array, indexed by tid.
 *   vout1, vout2   : per-thread output arrays, indexed by tid. Bit code0 of
 *                    the constant outswaps decides which of the two is
 *                    passed to d_vprod() as its first output.
 *   outputCouples2 : lookup table indexed by code0.
 *   outtype        : per-thread output, receives outputCouples2[code0].
 *   ien2           : forwarded unchanged to d_vprod().
 *
 * NOTE: tid is computed with a hard-coded block size of 512, so the kernel
 * must be launched with 512 threads per block for the indexing to be dense.
 * There is no bounds check on tid.
 *
 * (The __global__ qualifier had been stripped by the forum markup and is
 * restored here.)
 */
__global__ void d_gprod(unsigned short code, float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2,
                        unsigned char* outputCouples2, unsigned char* outtype, int ien2)
{
    const unsigned long long outswaps = 0x55aa55aa55aa55aaLL;
    unsigned short code0;
    float4 *voutS, *voutP;
    unsigned short swapon;

    int tid = blockIdx.x*512+threadIdx.x;

    code0 = code & 0x3f;
    swapon = (unsigned short) BIT(outswaps, code0);

    // Pick which output array plays which role for this code.
    if (swapon) {
        voutS = vout2+tid;
        voutP = vout1+tid;
    } else {
        voutS = vout1+tid;
        voutP = vout2+tid;
    }

    d_vprod(vinp1, vinp2+tid, voutS, voutP, ien2);

    *(outtype+tid) = outputCouples2[code0];
}

/*
 * Reports a failed CUDA runtime API call and terminates the program.
 *
 *   e        : status returned by the runtime API call.
 *   filename : source file of the call site (the checkError macro passes
 *              __FILE__, a string literal, hence const char*).
 *   line     : source line of the call site.
 *
 * Does nothing when e == cudaSuccess; otherwise prints the error code and
 * its description to stderr and calls exit(-1).
 *
 * (The function's closing brace was missing in the posted listing and is
 * restored here.)
 */
inline void __checkError(cudaError_t e, const char* filename, int line)
{
    if (e != cudaSuccess)
    {
        fprintf(stderr, "Runtime API error in file <%s>, line %i.\nError code %d: %s.\n",
                filename, line, e, cudaGetErrorString(e) );
        exit(-1);
    }
}

/*
 * Checks the most recent kernel launch in two steps and exits on failure:
 *   1. cudaGetLastError() right after the launch, to detect a launch that
 *      was rejected;
 *   2. cudaThreadSynchronize() followed by cudaGetLastError(), to detect an
 *      error raised while the kernel was running.
 * Each failure prints the error code and its description, then exit(-1).
 *
 * NOTE: cudaThreadSynchronize() is the synchronization call of this CUDA
 * generation; later toolkits deprecate it in favour of
 * cudaDeviceSynchronize().
 *
 * (The function's closing brace was missing in the posted listing and is
 * restored here.)
 */
inline void checkKernelError()
{
    cudaError_t e = cudaGetLastError();
    if (e != cudaSuccess)
    {
        printf("Kernel did not launch successfully.\nError code %d: %s.\n",e,cudaGetErrorString(e));
        exit(-1);
    }

    // Wait for the kernel to finish so that execution errors become visible.
    cudaThreadSynchronize();
    e = cudaGetLastError();
    if (e != cudaSuccess)
    {
        printf("Kernel did not complete successfully.\nError code %d: %s.\n",e,cudaGetErrorString(e));
        exit(-1);
    }
}

/*
 * Host driver: fills one float4 operand and NTHREADS float4 operands with
 * random values, runs d_gprod on the device with 512 threads per block,
 * checks the launch with checkKernelError(), copies the results back, and
 * prints the first three results and the two times measured with CUDA
 * events (kernel only, and kernel plus transfers).
 *
 * (Tokens that had been split across lines by the forum line wrapping --
 * "sizeof", "cudaMemcpyHostToDevice" and "unsigned" -- and the typographic
 * quotes in the final printf calls are repaired here.)
 */
int main(int argc, char** argv)
{
    // INIT
    NTHREADS = 1024*512;
    for(int i=0; i<=63; i++) outputCouples2[i]=0;

    cudaEvent_t start, stop, start_total, stop_total;
    checkError( cudaEventCreate(&start) );
    checkError( cudaEventCreate(&stop) );
    checkError( cudaEventCreate(&start_total) );
    checkError( cudaEventCreate(&stop_total) );

    // HOST MEMORY ALLOCATION
    unsigned short codein;
    float4 qin1;
    float4* qin2 = (float4*) malloc(NTHREADS*sizeof(float4));
    float4* qout1 = (float4*) malloc(NTHREADS*sizeof(float4));
    float4* qout2 = (float4*) malloc(NTHREADS*sizeof(float4));
    unsigned char* codeout = (unsigned char*) malloc(NTHREADS*sizeof(unsigned char));

    // DEVICE MEMORY ALLOCATION
    float4* d_qin1; checkError(cudaMalloc((void**)&d_qin1,sizeof(float4)));
    float4* d_qin2; checkError(cudaMalloc((void**)&d_qin2,NTHREADS*sizeof(float4)));
    float4* d_qout1; checkError(cudaMalloc((void**)&d_qout1,NTHREADS*sizeof(float4)));
    float4* d_qout2; checkError(cudaMalloc((void**)&d_qout2,NTHREADS*sizeof(float4)));
    unsigned char* d_outputCouples2; checkError(cudaMalloc((void**)&d_outputCouples2,64*sizeof(unsigned char)));
    unsigned char* d_codeout; checkError(cudaMalloc((void**)&d_codeout,NTHREADS*sizeof(unsigned char)));

    // DATA TRANSFER TO DEVICE
    checkError(cudaMemcpy(d_outputCouples2,outputCouples2,64*sizeof(unsigned char),cudaMemcpyHostToDevice));

    // TIMER INIT
    float gpu_kernel_time = 0, gpu_total_time = 0;

    // OPERANDS INIT
    qin1.x = drand; qin1.y = drand; qin1.z = drand; qin1.w = drand;
    for (int i=0; i<NTHREADS; i++)
    {
      qin2[i].x = drand; qin2[i].y=drand; qin2[i].z = drand; qin2[i].w = drand;
    }
    codein = 0;

    // TIMER START (KERNEL + DATA TRANSFER)
    checkError(cudaEventRecord(start_total, 0));

    // DATA TRANSFER TO DEVICE
    checkError(cudaMemcpy(d_qin2,qin2,NTHREADS*sizeof(float4),cudaMemcpyHostToDevice));
    checkError(cudaMemcpy(d_qin1,&qin1,sizeof(float4),cudaMemcpyHostToDevice));

    // TIMER START (KERNEL ONLY)
    checkError(cudaEventRecord(start, 0));

    // COMPUTATION ON DEVICE (KERNEL EXECUTION)
    // Up to 512 threads per block; the grid covers NTHREADS/512 blocks.
    dim3 dimBlock(((NTHREADS<=512)?NTHREADS:512),1);
    dim3 dimGrid(((NTHREADS<=512)?1:NTHREADS/512),1);
    d_gprod<<<dimGrid,dimBlock>>>(codein, d_qin1, d_qin2, d_qout1, d_qout2, d_outputCouples2, d_codeout, 0);
    checkKernelError();

    // TIMER END (KERNEL ONLY)
    checkError(cudaEventRecord(stop, 0));
    checkError(cudaEventSynchronize(stop));
    checkError(cudaEventElapsedTime(&gpu_kernel_time, start, stop));

    // DATA TRANSFER TO HOST
    checkError(cudaMemcpy(qout1,d_qout1,NTHREADS*sizeof(float4),cudaMemcpyDeviceToHost));
    checkError(cudaMemcpy(qout2,d_qout2,NTHREADS*sizeof(float4),cudaMemcpyDeviceToHost));
    checkError(cudaMemcpy(codeout,d_codeout,NTHREADS*sizeof(unsigned char),cudaMemcpyDeviceToHost));

    // TIMER END (KERNEL + DATA TRANSFER)
    checkError(cudaEventRecord(stop_total, 0));
    checkError(cudaEventSynchronize(stop_total));
    checkError(cudaEventElapsedTime(&gpu_total_time, start_total, stop_total));

    // SHOW FIRST 3 RESULTS
    for(int i=0; i<=2; i++)
    {
      printf("--- Operation n. %d ---\n", i+1);
      printf("(%.2f, %.2f, %.2f, %.2f) GP ", qin1.x,  qin1.y,  qin1.z,  qin1.w);
      printf("(%.2f, %.2f, %.2f, %.2f) = ", qin2[i].x,  qin2[i].y,  qin2[i].z,  qin2[i].w);
      printf("(%.2f, %.2f, %.2f, %.2f) + ", qout1[i].x, qout1[i].y, qout1[i].z, qout1[i].w);
      printf("(%.2f, %.2f, %.2f, %.2f)\n", qout2[i].x, qout2[i].y, qout2[i].z, qout2[i].w);
      //printf("Output code: %d \n", codeout[i]);
    }

    // SHOW TOTAL TIME
    printf("\nNumber of threads: %li\n",NTHREADS);
    printf("GPU kernel time: %f msec\n",gpu_kernel_time);
    printf("GPU total time: %f msec\n",gpu_total_time);

    return 0;
}[/codebox]

shifter1 · May 13, 2009, 12:51pm

How many registers are you using per kernel? Multiply the number of threads you are creating by the number of registers per kernel and post the result.

Jamie_K · May 13, 2009, 3:18pm

I agree, it is very likely that you are using too many registers. With a whopping 512 threads per block, there are not enough registers in a multiprocessor to launch even one block. A small rearrangement of the code could change the register usage just enough to break your kernel.

Change the block size to 128 threads per block and use 4 times as many blocks.

Carmelo · May 13, 2009, 9:07pm

shifter1, Jamie K

You have correctly addressed the centre of the problem!

I have compiled only the kernel to see the cubin file

and I discovered that it uses 18 registers.

Interestingly, it uses only 16 registers with the workaround,

in spite of the two new float variables.

Considering that:

[*] the maximum number of registers per block is 8192

[*] 16 * 512 = 8192

[*] 8192 / 18 = 455

the mystery is solved. :-)

As suggested, I reduced the number of threads per block

and now the code works correctly.

Thank you very much indeed.

Topic		Replies	Views
Fewer than MaxThreads per Block Fails (Code Included) 400 threads per block fails while 300 successf CUDA Programming and Performance	9	1147	March 24, 2011
TOO MANY RESOURCES REQUESTED FOR LAUNCH CUDA Programming and Performance	16	11905	September 2, 2008
Wrong output when adding blocks what am I doing wrong? CUDA Programming and Performance	13	12018	December 4, 2007
How to debug kernel throwing an exception? CUDA Programming and Performance	16	8019	June 14, 2013
Error on Kernel launch CUDA Programming and Performance	7	1794	April 1, 2010
How to calculate register resource correctly? I met strange problem on calculate resource such as re CUDA Programming and Performance	2	1306	July 27, 2010
Maximum Threads for Kernel Call CUDA Programming and Performance	38	16514	May 25, 2010
emu vs debug, different values CUDA Programming and Performance	48	15859	February 5, 2009
Emulator works but G80 doesn't CUDA Programming and Performance	11	5453	July 3, 2007
Matrix multiplication ERRORS & few thoughts on CUDA Basic programming errors need correction CUDA Programming and Performance	14	13348	January 24, 2009

Very strange behaviour. Maybe a bug...? Kernel fails to run strangely, but no errors are reported.

Related topics