printf inside a kernel is not working on an NVIDIA Quadro 4000

Hello everyone!

I’m testing some code on a Quadro 4000 (compute capability 2.0), and I was wondering how to print from within a kernel.

I’ve tried this piece of code:

#include <stdio.h>

// Fallback for device compilation passes targeting an architecture below 200:
// printf(...) is replaced by an expression that still evaluates the format
// and the arguments, discards them, and yields 0. No text is produced there.
// The variadic form requires at least one argument after the format string.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)

#define printf(f, ...) ((void)(f, __VA_ARGS__),0)

#endif

// Prints one greeting line, carrying the value f, from the thread whose
// threadIdx.x is 0; every other thread returns without doing anything.
__global__ void helloCUDA(float f)
{
    const unsigned int lane = threadIdx.x;

    // Guard clause: only the thread with index 0 goes on to print.
    if (lane != 0)
        return;

    printf("Hello thread %d, f=%f\n", lane, f);
}

// Launches helloCUDA with 1 block of 5 threads, waits for it to finish and
// reports any CUDA error. Returns 0 on success and 1 if the launch or the
// kernel execution failed.
int main()
{
    helloCUDA<<<1, 5>>>(1.2345f);

    // A kernel launch returns nothing itself: configuration errors are read
    // back with cudaGetLastError(), and faults raised while the kernel runs
    // are reported by the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    if (err != cudaSuccess) {
        fprintf(stderr, "helloCUDA failed: %s\n", cudaGetErrorString(err));
        cudaDeviceReset();
        return 1;
    }

    cudaDeviceReset();
    return 0;
}

And it works just fine.

Very excited about that, I tried to do the same in my own code, as follows:

#include <stdio.h>

// Fallback for device compilation passes targeting an architecture below 200:
// printf(...) is replaced by an expression that still evaluates the format
// and the arguments, discards them, and yields 0. No text is produced there.
// The variadic form requires at least one argument after the format string.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)

#define printf(f, ...) ((void)(f, __VA_ARGS__),0)

#endif

// Texture references with short elements. mask_prop_fine reads both with
// tex1Dfetch(), indexed by the global thread id.
texture<short> tex_phi;

texture<short> tex_mask;

// Two sizes kept in constant memory; the kernel reads d_N[0] as NL and
// d_N[1] as NC.
__device__ __constant__ short d_N[2]; // NL, NC

// Unsized (extern) shared array: its size is not fixed here, so it has to be
// supplied when the kernel is launched.
extern __shared__ int cache[];

// mask_prop_fine: one thread per global index gid = threadIdx.x + blockIdx.x * blockDim.x.
// Threads with gid < NL (NL = d_N[0]) fetch l = tex_phi[gid] and m = tex_mask[gid],
// look up v = vars[|l|], update a shared-memory counter at index um = |m|, and then
// the threads with m < 0 or gid == 0 write that counter and the recorded id to
// mask_num[um] and mask_id[um].
//   mask_num, mask_id : outputs, indexed by um (presumably device memory -- confirm)
//   vars              : read at index |l|; vars[|var|] is set to 1 or -1 by the sign of var
//   var               : signed selector; nothing is written when var == 0
__global__ void mask_prop_fine( int* mask_num, int* mask_id, short* vars, short var ){

    // NOTE(review): gid is a short, so an index above 32767 does not fit.
    short gid = threadIdx.x + blockIdx.x * blockDim.x;

    short NL = d_N[0];

    short NC = d_N[1];

    // Shared layout: s_mask_num starts at cache[0], s_mask_id starts NC ints later.
    int* s_mask_num = (int*)cache;

    int* s_mask_id = (int*)&s_mask_num[NC];

if(gid < NL){

        // bid is not used anywhere below.
        short bid = threadIdx.x;

        short l, m, v, um;

// Only the thread whose gid equals |var| writes, and only when var != 0.
if(gid == abs(var) && var > 0) vars[gid] = 1;

        else if(gid == abs(var) && var < 0) vars[gid] = -1;

        // NOTE(review): this barrier sits inside if(gid < NL); threads of the
        // block with gid >= NL never reach it.
        __syncthreads();

l = tex1Dfetch(tex_phi, gid);

        m = tex1Dfetch(tex_mask, gid);

        um = abs(m);

        v = vars[abs(l)];

        // Every thread resets the slot it is about to update.
        // NOTE(review): other threads that share um may already have updated it.
        s_mask_num[um] = 0;

        // Debug output: one line per thread with gid < NL.
        printf("Thread %d, l = %d, m = %d, v = %d\n", 

                   threadIdx.x, l, m, v) ;

// l < 0 with v == -1, or l > 0 with v == 1: the slot is set to NL.
if ((( l < 0 ) && ( v == -1 )) ||    

            (( l > 0 ) && ( v == 1 )))       

            s_mask_num[um] = NL;                   

// Slot still below NL and v == 0: count this thread and record its gid.
if(s_mask_num[um] < NL && v == 0){

            atomicAdd( &s_mask_num[um], 1);

            s_mask_id[um] = gid;

        }

        // NOTE(review): conditional barrier, same concern as the one above.
        __syncthreads();

// Threads with m < 0, plus gid 0, copy the shared results to the outputs.
if(m < 0 || gid == 0){

            mask_num[um] = 0;

            atomicAdd(&mask_num[um], s_mask_num[um]);

            // NOTE(review): s_mask_id[um] is only written in the v == 0 branch above.
            mask_id[um] = s_mask_id[um];

        }

        // NOTE(review): conditional barrier; no code follows it in this kernel.
        __syncthreads();

    }

}

Adding:

// NOTE(review): as written, this launch passes no third <<<...>>> parameter,
// so no dynamic shared memory is requested, yet the kernel indexes the
// unsized `extern __shared__ int cache[]`. TODO: pass the required byte count
// as the third launch parameter.
// NOTE(review): neither the launch nor cudaDeviceReset() is checked for errors.
mask_prop_fine<<<blocks, threads>>>( mask_num, mask_id, vars, var );

   cudaDeviceReset();

It compiled without warnings, but then the printf just did not work :(

What have I done wrong? Any ideas? I’m quite a noob with the CUDA framework, I know…

I haven’t looked closely, but one problem stands out: You are using __syncthreads() in conditional code that is not reached by all threads. Move them out of the conditional:

/*
 * mask_prop_fine
 *
 * One thread per global index gid = threadIdx.x + blockIdx.x * blockDim.x;
 * only threads with gid < NL (NL = d_N[0]) do any work. The kernel runs in
 * three phases separated by block-wide barriers that every thread reaches:
 *
 *   1. The thread whose gid equals |var| sets vars[gid] to 1 (var > 0) or
 *      -1 (var < 0). Nothing is written when var == 0.
 *   2. Each active thread fetches l = tex_phi[gid] and m = tex_mask[gid],
 *      reads v = vars[|l|] and updates the shared counter at um = |m|:
 *      the counter is set to NL when (l < 0, v == -1) or (l > 0, v == 1);
 *      otherwise, when v == 0, it is incremented and gid is recorded.
 *   3. Threads with m < 0, plus gid 0, copy the shared counter and the
 *      recorded id to mask_num[um] and mask_id[um].
 *
 * Parameters:
 *   mask_num, mask_id  outputs indexed by um (presumably device memory of at
 *                      least max(um) + 1 ints each -- confirm with the caller)
 *   vars               read at |l|, written at |var|
 *   var                signed selector for phase 1
 *
 * Launch requirements: a 1D grid, and dynamic shared memory passed as the
 * third <<<...>>> parameter. s_mask_id starts NC ints into `cache` and both
 * shared arrays are indexed by um, so presumably at least 2 * NC ints are
 * needed -- confirm that um < NC always holds.
 */
__global__ void mask_prop_fine( int* mask_num, int* mask_id, short* vars, short var ){
    // int rather than short: the index expression is wider than 16 bits and
    // would be truncated for launches with more than 32767 threads.
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    const short NL = d_N[0];
    const short NC = d_N[1];

    // Shared layout: NC counters first, the recorded ids directly after them.
    int* s_mask_num = (int*)cache;
    int* s_mask_id  = (int*)&s_mask_num[NC];

    // Declared at function scope because the values fetched in phase 2 are
    // still needed in phase 3, on the other side of a barrier.
    short l = 0, m = 0, v = 0, um = 0;

    const bool in_range = (gid < NL);

    // Phase 1: apply the assignment selected by var.
    if (in_range) {
        if (gid == abs(var) && var > 0)
            vars[gid] = 1;
        else if (gid == abs(var) && var < 0)
            vars[gid] = -1;
    }

    // Reached by every thread of the block, including those with gid >= NL.
    // NOTE(review): this only orders threads of the same block; a vars[]
    // entry written by another block is not guaranteed to be visible below.
    __syncthreads();

    // Phase 2: per-block accumulation in shared memory.
    if (in_range) {
        l  = tex1Dfetch(tex_phi, gid);
        m  = tex1Dfetch(tex_mask, gid);
        um = abs(m);
        v  = vars[abs(l)];

        // NOTE(review): each thread resets its slot without a barrier before
        // the updates below, so threads sharing um can overwrite each other's
        // contribution. Left as in the original; fixing it changes the
        // algorithm and needs the author's intent.
        s_mask_num[um] = 0;

        // Debug output; it is buffered on the device and only shown once the
        // kernel has finished without faulting.
        printf("Thread %d, l = %d, m = %d, v = %d\n",
               threadIdx.x, l, m, v);

        if (((l < 0) && (v == -1)) ||
            ((l > 0) && (v == 1)))
            s_mask_num[um] = NL;

        if (s_mask_num[um] < NL && v == 0) {
            atomicAdd(&s_mask_num[um], 1);
            s_mask_id[um] = gid;
        }
    }

    // Shared-memory results must be complete before any thread publishes them.
    __syncthreads();

    // Phase 3: publish the per-block results.
    if (in_range && (m < 0 || gid == 0)) {
        // NOTE(review): the plain store followed by atomicAdd is not atomic
        // as a pair if several blocks publish to the same um.
        mask_num[um] = 0;
        atomicAdd(&mask_num[um], s_mask_num[um]);

        // NOTE(review): s_mask_id[um] is only written in the v == 0 branch
        // of phase 2, so it may still be uninitialized here.
        mask_id[um] = s_mask_id[um];
    }

    // No trailing barrier: nothing after this point depends on other threads.
}

Your kernel needs to exit cleanly in order for printf to actually print. The prints do not occur in realtime but are instead saved to a buffer which is printed after the kernel exits.