Very strange behaviour. Maybe a bug...? Kernel fails to run strangely, but no errors are reported.

----- Synopsis description of the problem -----

Kernel fails to run strangely.

----- Detailed description of the problem -----

Kernel fails to run:

return data are incorrect (tipically zeros),

kernel execution time is almost zero,

but no errors are reported.

Kernel works incorrectly when

the following operation is performed:

[codebox]// UNCOMMENT THE FOLLOWING LINE TO "SOLVE" THE PROBLEM

//#define WORKAROUND_ON

#include <cuda_runtime.h>

#include <stdio.h>

#include <cutil_inline.h>

#define em2(xmask, k) ((float) ( (((xmask) >> (k)) & 3) - 1))

#define BIT(PROG, IND) (((PROG) >> (IND) ) & 1)

#define drand (((double) rand()) / RAND_MAX)

long int NTHREADS;

unsigned char outputCouples2[64];

device void d_vprod(float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2, int ien2)

{

float4 in1 = *vinp1, in2 = *vinp2;

#ifdef WORKAROUND_ON

float bug1, bug2;

bug1 = em2(ien2,28);

bug2 = in1.w*in2.z;

#endif

vout1->x = em2(ien2,6)(in1.x)(in2.w) + em2(ien2,24)(in1.w)(in2.x);

vout1->y = em2(ien2,14)*(in1.y)*(in2.w) + em2(ien2,26)*(in1.w)*(in2.y);

#ifdef WORKAROUND_ON

vout1->z = em2(ien2,22)*(in1.z)*(in2.w) + bug1*bug2;

#else

vout1->z = em2(ien2,22)*(in1.z)*(in2.w) + em2(ien2,28)*in1.w*in2.z;

#endif

vout1->w = em2(ien2,20)*(in1.z)*(in2.z) + em2(ien2,30)*(in1.w)*(in2.w); //scalar

vout2->x = em2(ien2,12)(in1.y)(in2.z) + em2(ien2,18)(in1.z)(in2.y);

vout2->y = em2(ien2,4)*(in1.x)*(in2.z) + em2(ien2,16)*(in1.z)*(in2.x);

vout2->z = em2(ien2,2)*(in1.x)*(in2.y) + em2(ien2,8)*(in1.y)*(in2.x);

vout2->w = em2(ien2,0)*(in1.x)*(in2.x) + em2(ien2,10)*(in1.y)*(in2.y); //scalar

vout1->w += vout2->w;

vout2->w = 0;

}

global void d_gprod(unsigned short code, float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2,

                    unsigned char* outputCouples2, unsigned char* outtype, int ien2)

{

const unsigned long long outswaps = 0x55aa55aa55aa55aaLL;

unsigned short code0;

float4 *voutS, *voutP;

unsigned short swapon;

int tid = blockIdx.x*512+threadIdx.x;

code0 = code & 0x3f;

swapon = (unsigned short) BIT(outswaps, code0);

if (swapon) {

    voutS = vout2+tid;

    voutP = vout1+tid;

} else {

    voutS = vout1+tid;

    voutP = vout2+tid;

}

d_vprod(vinp1, vinp2+tid, voutS, voutP, ien2);

*(outtype+tid) = outputCouples2[code0];

}

int main(int argc, char** argv)

{

// INIT

NTHREADS = 512*512;

for(int i=0; i<=63; i++) outputCouples2[i]=0;

// HOST MEMORY ALLOCATION

unsigned short codein;

float4 qin1;

float4* qin2 = (float4*) malloc(NTHREADS*sizeof(float4));

float4* qout1 = (float4*) malloc(NTHREADS*sizeof(float4));

float4* qout2 = (float4*) malloc(NTHREADS*sizeof(float4));

unsigned char* codeout = (unsigned char*) malloc(NTHREADS*sizeof(unsigned char));

// DEVICE MEMORY ALLOCATION

float4* d_qin1; cutilSafeCall(cudaMalloc((void**)&d_qin1,sizeof(float4)));

float4* d_qin2; cutilSafeCall(cudaMalloc((void**)&d_qin2,NTHREADS*sizeof(float4)));

float4* d_qout1; cutilSafeCall(cudaMalloc((void**)&d_qout1,NTHREADS*sizeof(float4)));

float4* d_qout2; cutilSafeCall(cudaMalloc((void**)&d_qout2,NTHREADS*sizeof(float4)));

unsigned char* d_outputCouples2; cutilSafeCall(cudaMalloc((void**)&d_outputCouples2,64*sizeof(unsigned char)));

unsigned char* d_codeout; cutilSafeCall(cudaMalloc((void**)&d_codeout,NTHREADS*sizeof(unsigned char)));

// DATA TRANSFER TO DEVICE

cutilSafeCall(cudaMemcpy(d_outputCouples2,outputCouples2,64*

sizeof(unsigned char),cudaMemcpyHostToDevice));

// TIMER INIT

double gpu_kernel_time = 0, gpu_total_time = 0;

unsigned int hTimer, hTimer2;

// OPERANDS INIT

qin1.x = drand; qin1.y = drand; qin1.z = drand; qin1.w = drand;

for (int i=0; i<NTHREADS; i++)

{

  qin2[i].x = drand; qin2[i].y=drand; qin2[i].z = drand; qin2[i].w = drand;

}

codein = 0;

// TIMER START (KERNEL + DATA TRANSFER)

cutilCheckError( cutCreateTimer(&hTimer2) );

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutResetTimer(hTimer2) );

cutilCheckError( cutStartTimer(hTimer2) );

// DATA TRANSFER TO DEVICE

cutilSafeCall(cudaMemcpy(d_qin2,qin2,NTHREADS*sizeof(float4)

,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_qin1,&qin1,sizeof(float4),cudaMemcpyHostToDevice));

// TIMER START (KERNEL ONLY)

cutilCheckError( cutCreateTimer(&hTimer) );

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutResetTimer(hTimer) );

cutilCheckError( cutStartTimer(hTimer) );

// COMPUTATION ON DEVICE (KERNEL EXECUTION)

dim3 dimBlock(((NTHREADS<=512)?NTHREADS:512),1);

dim3 dimGrid(((NTHREADS<=512)?1:NTHREADS/512),1);

d_gprod<<<dimGrid,dimBlock>>>(codein, d_qin1, d_qin2, d_qout1, d_qout2, d_outputCouples2, d_codeout, 0);

// TIMER END (KERNEL ONLY)

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutStopTimer(hTimer) );

gpu_kernel_time = cutGetTimerValue(hTimer);

// DATA TRANSFER TO HOST

cutilSafeCall(cudaMemcpy(qout1,d_qout1,NTHREADS*sizeof(float

4),cudaMemcpyDeviceToHost));

cutilSafeCall(cudaMemcpy(qout2,d_qout2,NTHREADS*sizeof(float

4),cudaMemcpyDeviceToHost));

cutilSafeCall(cudaMemcpy(codeout,d_codeout,NTHREADS*sizeof(u

nsigned char),cudaMemcpyDeviceToHost));

// TIMER END (KERNEL + DATA TRANSFER)

cutilSafeCall( cudaThreadSynchronize() );

cutilCheckError( cutStopTimer(hTimer2) );

gpu_total_time = cutGetTimerValue(hTimer2);

// SHOW FIRST 3 RESULTS

for(int i=0; i<=2; i++)

{

  printf("--- Operation n. %d ---\n", i+1);

  printf("(%.2f, %.2f, %.2f, %.2f) GP ", qin1.x,  qin1.y,  qin1.z,  qin1.w);

  printf("(%.2f, %.2f, %.2f, %.2f) = ", qin2[i].x,  qin2[i].y,  qin2[i].z,  qin2[i].w);

  printf("(%.2f, %.2f, %.2f, %.2f) + ", qout1[i].x, qout1[i].y, qout1[i].z, qout1[i].w);

  printf("(%.2f, %.2f, %.2f, %.2f)\n", qout2[i].x, qout2[i].y, qout2[i].z, qout2[i].w);

  //printf("Output code: %d \n", codeout[i]);

}

// SHOW TOTAL TIME

printf("\nNumber of threads: %li\n",NTHREADS);

printf(“GPU kernel time: %lf msec\n”,gpu_kernel_time);

printf(“GPU total time: %lf msec\n”,gpu_total_time);

return 0;

}[/codebox]

  1. Stop using cutil. It should not be used by anyone and is absolutely not a substitute for doing robust error checking yourself. It’s a convenience wrapper for sample code, nothing more. (note to self: hide TMURRAY_WILL_HUNT_YOU_DOWN in common.mk, wrap big #ifdefs in cutil.h so people can’t use it unless they actually look at it and therefore realize that it is not something they should be using)

  2. Do timing with CUDA events instead of CPU side timers.

  3. You’re not doing error checking correctly. Most likely you’re not actually launching the kernel. To check errors correctly in the runtime API, you actually have to do:

kernel<<<...>>>();

if (cudaGetLastError() != cudaSuccess)

  {

	printf("Kernel did not launch successfully\n");

	..

  }

cudaThreadSynchronize();

if (cudaGetLastError() != cudaSuccess)

  {

	printf("Kernel did not complete successfully\n");

	..

  }

First of all, thank you for the prompt answer.

  1. Stop using cutil.

Ok. Done.

  1. Do timing with CUDA events instead of CPU side timers.

Ok. Done.

  1. You’re not doing error checking correctly.

Most likely you’re not actually launching the kernel.

To check errors correctly in the runtime API, you actually have to do:

[…]

Ok. Done.

The problem persists.

The kernel fails to launch, reporting error 7:

“too many resources requested for launch”.

It doesn’t make sense,

because kernel launch fails, for example,

with only 449 threads on configuration -C-…

but it works correctly with over 4 million threads

with the workaround described above

where I use two more float4 variables.

Is there any explanation?

Complete updated code follows:

[codebox]

// UNCOMMENT THE FOLLOWING LINE TO “SOLVE” THE PROBLEM

//#define WORKAROUND_ON

#include <cuda_runtime.h>

#include <stdio.h>

#define em2(xmask, k) ((float) ( (((xmask) >> (k)) & 3) - 1))

#define BIT(PROG, IND) (((PROG) >> (IND) ) & 1)

#define drand (((double) rand()) / RAND_MAX)

#define checkError(err) __checkError(err, FILE, LINE)

long int NTHREADS;

unsigned char outputCouples2[64];

device void d_vprod(float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2, int ien2)

{

float4 in1 = *vinp1, in2 = *vinp2;

#ifdef WORKAROUND_ON

float bug1, bug2;

bug1 = em2(ien2,28);

bug2 = in1.w*in2.z;

#endif

vout1->x = em2(ien2,6) in1.xin2.w + em2(ien2,24)in1.win2.x;

vout1->y = em2(ien2,14)*in1.y*in2.w + em2(ien2,26)*in1.w*in2.y;

#ifdef WORKAROUND_ON

vout1->z = em2(ien2,22)*in1.z*in2.w + bug1*bug2;

#else

vout1->z = em2(ien2,22)*in1.z*in2.w + em2(ien2,28)*in1.w*in2.z;

#endif

vout1->w = em2(ien2,20)*in1.z*in2.z + em2(ien2,30)*in1.w*in2.w;

vout2->x = em2(ien2,12)in1.yin2.z + em2(ien2,18)in1.zin2.y;

vout2->y = em2(ien2,4) *in1.x*in2.z + em2(ien2,16)*in1.z*in2.x;

vout2->z = em2(ien2,2) *in1.x*in2.y + em2(ien2,8) *in1.y*in2.x;

vout2->w = em2(ien2,0) *in1.x*in2.x + em2(ien2,10)*in1.y*in2.y;

vout1->w += vout2->w;

vout2->w = 0;

}

global void d_gprod(unsigned short code, float4 *vinp1, float4 *vinp2, float4 *vout1, float4 *vout2,

                    unsigned char* outputCouples2, unsigned char* outtype, int ien2)

{

const unsigned long long outswaps = 0x55aa55aa55aa55aaLL;

unsigned short code0;

float4 *voutS, *voutP;

unsigned short swapon;

int tid = blockIdx.x*512+threadIdx.x;

code0 = code & 0x3f;

swapon = (unsigned short) BIT(outswaps, code0);

if (swapon) {

    voutS = vout2+tid;

    voutP = vout1+tid;

} else {

    voutS = vout1+tid;

    voutP = vout2+tid;

}

d_vprod(vinp1, vinp2+tid, voutS, voutP, ien2);

*(outtype+tid) = outputCouples2[code0];

}

inline void __checkError(cudaError_t e, char* filename, int line)

{

if (e != cudaSuccess)

{

 fprintf(stderr, "Runtime API error in file <%s>, line %i.\nError code %d: %s.\n",

         filename, line, e, cudaGetErrorString(e) );

 exit(-1);

}

}

inline void checkKernelError()

{

cudaError_t e = cudaGetLastError();

if (e != cudaSuccess)

{

  printf("Kernel did not launch successfully.\nError code %d: %s.\n",e,cudaGetErrorString(e));

  exit(-1);

}

cudaThreadSynchronize();

e = cudaGetLastError();

if (e != cudaSuccess)

{

  printf("Kernel did not complete successfully.\nError code %d: %s.\n",e,cudaGetErrorString(e));

  exit(-1);

}

}

int main(int argc, char** argv)

{

// INIT

NTHREADS = 1024*512;

for(int i=0; i<=63; i++) outputCouples2[i]=0;

cudaEvent_t start, stop, start_total, stop_total;

checkError( cudaEventCreate(&start) );

checkError( cudaEventCreate(&stop) );

checkError( cudaEventCreate(&start_total) );

checkError( cudaEventCreate(&stop_total) );

// HOST MEMORY ALLOCATION

unsigned short codein;

float4 qin1;

float4* qin2 = (float4*) malloc(NTHREADS*sizeof(float4));

float4* qout1 = (float4*) malloc(NTHREADS*sizeof(float4));

float4* qout2 = (float4*) malloc(NTHREADS*sizeof(float4));

unsigned char* codeout = (unsigned char*) malloc(NTHREADS*sizeof(unsigned char));

// DEVICE MEMORY ALLOCATION

float4* d_qin1; checkError(cudaMalloc((void**)&d_qin1,sizeof(float4)));

float4* d_qin2; checkError(cudaMalloc((void**)&d_qin2,NTHREADS*sizeof(float4)));

float4* d_qout1; checkError(cudaMalloc((void**)&d_qout1,NTHREADS*sizeof(float4)));

float4* d_qout2; checkError(cudaMalloc((void**)&d_qout2,NTHREADS*sizeof(float4)));

unsigned char* d_outputCouples2; checkError(cudaMalloc((void**)&d_outputCouples2,64*sizeof(unsigned char)));

unsigned char* d_codeout; checkError(cudaMalloc((void**)&d_codeout,NTHREADS*sizeof(unsigned char)));

// DATA TRANSFER TO DEVICE

checkError(cudaMemcpy(d_outputCouples2,outputCouples2,64*siz

eof(unsigned char),cudaMemcpyHostToDevice));

// TIMER INIT

float gpu_kernel_time = 0, gpu_total_time = 0;

// OPERANDS INIT

qin1.x = drand; qin1.y = drand; qin1.z = drand; qin1.w = drand;

for (int i=0; i<NTHREADS; i++)

{

  qin2[i].x = drand; qin2[i].y=drand; qin2[i].z = drand; qin2[i].w = drand;

}

codein = 0;

// TIMER START (KERNEL + DATA TRANSFER)

checkError(cudaEventRecord(start_total, 0));

// DATA TRANSFER TO DEVICE

checkError(cudaMemcpy(d_qin2,qin2,NTHREADS*sizeof(float4),cu

daMemcpyHostToDevice));

checkError(cudaMemcpy(d_qin1,&qin1,sizeof(float4),cudaMemcpyHostToDevice));

// TIMER START (KERNEL ONLY)

checkError(cudaEventRecord(start, 0));

// COMPUTATION ON DEVICE (KERNEL EXECUTION)

dim3 dimBlock(((NTHREADS<=512)?NTHREADS:512),1);

dim3 dimGrid(((NTHREADS<=512)?1:NTHREADS/512),1);

d_gprod<<<dimGrid,dimBlock>>>(codein, d_qin1, d_qin2, d_qout1, d_qout2, d_outputCouples2, d_codeout, 0);

checkKernelError();

// TIMER END (KERNEL ONLY)

checkError(cudaEventRecord(stop, 0));

checkError(cudaEventSynchronize(stop));

checkError(cudaEventElapsedTime(&gpu_kernel_time, start, stop));

// DATA TRANSFER TO HOST

checkError(cudaMemcpy(qout1,d_qout1,NTHREADS*sizeof(float4),

cudaMemcpyDeviceToHost));

checkError(cudaMemcpy(qout2,d_qout2,NTHREADS*sizeof(float4),

cudaMemcpyDeviceToHost));

checkError(cudaMemcpy(codeout,d_codeout,NTHREADS*sizeof(unsi

gned char),cudaMemcpyDeviceToHost));

// TIMER END (KERNEL + DATA TRANSFER)

checkError(cudaEventRecord(stop_total, 0));

checkError(cudaEventSynchronize(stop_total));

checkError(cudaEventElapsedTime(&gpu_total_time, start_total, stop_total));

// SHOW FIRST 3 RESULTS

for(int i=0; i<=2; i++)

{

  printf("--- Operation n. %d ---\n", i+1);

  printf("(%.2f, %.2f, %.2f, %.2f) GP ", qin1.x,  qin1.y,  qin1.z,  qin1.w);

  printf("(%.2f, %.2f, %.2f, %.2f) = ", qin2[i].x,  qin2[i].y,  qin2[i].z,  qin2[i].w);

  printf("(%.2f, %.2f, %.2f, %.2f) + ", qout1[i].x, qout1[i].y, qout1[i].z, qout1[i].w);

  printf("(%.2f, %.2f, %.2f, %.2f)\n", qout2[i].x, qout2[i].y, qout2[i].z, qout2[i].w);

  //printf("Output code: %d \n", codeout[i]);

}

// SHOW TOTAL TIME

printf("\nNumber of threads: %li\n",NTHREADS);

printf(“GPU kernel time: %f msec\n”,gpu_kernel_time);

printf(“GPU total time: %f msec\n”,gpu_total_time);

return 0;

}[/codebox]

How many registers are you using per kernel? Multiply the number of threads you are creating by the number of registers per kernel and post the result.

I agree, it is very likely that you are using too many registers. With a whopping 512 threads per block, there are not enough registers in a multiprocessor to launch even one block. A small rearrangement of the code could change the register usage just enough to break your kernel.

Change the block size to 128 threads per block and use 4 times as many blocks.

shifter1, Jamie K

You have correctly addressed the centre of the problem!

I have compiled only the kernel to see the cubin file

and I discovered that it uses 18 registers.

Interestingly, it uses only 16 registers with the workaround,

in spite of the two new float4 variables.

Considering that:

    the maximum number of registers per block is 8192

    16 * 512 = 8192

    8192 / 18 = 455

the mistery is solved. :-)

As suggested, I reduced the number of threads per block

and now the code works correctly.

Thank you very much indeed.