Hello everyone!
I’m testing some code on a Quadro 4000 (compute 2.0) so I was wondering how to print within a kernel.
I’ve tried this piece of code:
#include <stdio.h>
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
#endif
__global__ void helloCUDA(float f)
{
if (threadIdx.x == 0)
printf("Hello thread %d, f=%f\n", threadIdx.x, f) ;
}
int main()
{
helloCUDA<<<1, 5>>>(1.2345f);
cudaDeviceReset();
return 0;
}
And it works just fine.
Very excited about that, I’ve tried to do the same in my code as following:
#include <stdio.h>
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
#endif
texture<short> tex_phi;
texture<short> tex_mask;
__device__ __constant__ short d_N[2]; // NL, NC
extern __shared__ int cache[];
__global__ void mask_prop_fine( int* mask_num, int* mask_id, short* vars, short var ){
short gid = threadIdx.x + blockIdx.x * blockDim.x;
short NL = d_N[0];
short NC = d_N[1];
int* s_mask_num = (int*)cache;
int* s_mask_id = (int*)&s_mask_num[NC];
if(gid < NL){
short bid = threadIdx.x;
short l, m, v, um;
if(gid == abs(var) && var > 0) vars[gid] = 1;
else if(gid == abs(var) && var < 0) vars[gid] = -1;
__syncthreads();
l = tex1Dfetch(tex_phi, gid);
m = tex1Dfetch(tex_mask, gid);
um = abs(m);
v = vars[abs(l)];
s_mask_num[um] = 0;
printf("Thread %d, l = %d, m = %d, v = %d\n",
threadIdx.x, l, m, v) ;
if ((( l < 0 ) && ( v == -1 )) ||
(( l > 0 ) && ( v == 1 )))
s_mask_num[um] = NL;
if(s_mask_num[um] < NL && v == 0){
atomicAdd( &s_mask_num[um], 1);
s_mask_id[um] = gid;
}
__syncthreads();
if(m < 0 || gid == 0){
mask_num[um] = 0;
atomicAdd(&mask_num[um], s_mask_num[um]);
mask_id[um] = s_mask_id[um];
}
__syncthreads();
}
}
Adding:
mask_prop_fine<<<blocks, threads>>>( mask_num, mask_id, vars, var );
cudaDeviceReset();
Compiled without warnings but then the printf just did not work :(
What I’ve done wrong? Any ideas? I’m quite a noob with CUDA framework, I know…