Problem using cuPrintf when combining CUDA with ITK

I am trying to use cuPrintf in a project that combines CUDA with ITK. When I use the example from cuPrintf.cu, it works. The code looks like this:

main.cu:

        #include "cuPrintf.cu"

      // Kernel: every launched thread logs the integer it was handed via cuPrintf.
      __global__ void testKernel(int val)
      {
          cuPrintf("Value is: %d\n", val);
      }

     // Host entry point: initialise cuPrintf, launch testKernel on 2 blocks of
     // 3 threads, dump the collected output to stdout and shut cuPrintf down.
     // Returns 0 on success, 1 if any CUDA step reports an error.
     int main() {
         // Without a successful init there is nothing to display later, so
         // report the failure instead of silently producing no output.
         cudaError_t err = cudaPrintfInit();
         if (err != cudaSuccess) {
             fprintf(stderr, "cudaPrintfInit failed: %s\n", cudaGetErrorString(err));
             return 1;
         }

         testKernel<<< 2, 3 >>>(10);

         // A kernel launch returns no status of its own; launch errors are
         // only visible through cudaGetLastError().
         err = cudaGetLastError();
         if (err != cudaSuccess) {
             fprintf(stderr, "testKernel launch failed: %s\n", cudaGetErrorString(err));
         } else {
             err = cudaPrintfDisplay(stdout, true);
             if (err != cudaSuccess) {
                 fprintf(stderr, "cudaPrintfDisplay failed: %s\n", cudaGetErrorString(err));
             }
         }

         // Always pair the successful init with cudaPrintfEnd, even on failure.
         cudaPrintfEnd();
         return (err == cudaSuccess) ? 0 : 1;
     }

However, since I am using CUDA together with ITK, I want to wrap the CUDA code and put it in a separate file (apart from main.cpp).

It looks like:

main.cpp:

      // Implemented in cuPrintf_wrapper.cu; declared with C linkage so the name
      // matches the extern "C" definition there.
      extern "C" void cuPrintf_wrapper();

      // Host-only entry point: all CUDA work is delegated to the wrapper.
      int main()
      {
          cuPrintf_wrapper();
          return 0;
      }

cuPrintf_wrapper.cu:

      #include "cuPrintf.cu"
  #include <stdio.h>

     // Kernel: each thread prints the value it was given, followed by its block
     // index, its thread index within the block, and its flattened 1D index
     // (blockIdx.x * blockDim.x + threadIdx.x).
     __global__ void testKernel(int val)
     {
         const int blockId  = blockIdx.x;
         const int globalId = blockId * blockDim.x + threadIdx.x;

         cuPrintf("Value is: %d\n", val);
         cuPrintf("block is %d, thread is %d, tid is %d\n", blockId, threadIdx.x, globalId);
     }

     // C-linkage entry point called from main.cpp: initialises cuPrintf,
     // launches testKernel on 2 blocks of 3 threads, dumps the collected output
     // to stdout and shuts cuPrintf down.
     // The function returns void, so failures are reported on stderr; this
     // turns a silent "no output" run into a diagnosable one.
     extern "C"  void cuPrintf_wrapper() {
         cudaError_t err = cudaPrintfInit();
         if (err != cudaSuccess) {
             fprintf(stderr, "cudaPrintfInit failed: %s\n", cudaGetErrorString(err));
             return;
         }

         testKernel<<< 2, 3 >>>(10);

         // A kernel launch returns no status of its own; launch errors (for
         // example a binary built for the wrong architecture) are only visible
         // through cudaGetLastError().
         err = cudaGetLastError();
         if (err != cudaSuccess) {
             fprintf(stderr, "testKernel launch failed: %s\n", cudaGetErrorString(err));
         } else {
             err = cudaPrintfDisplay(stdout, true);
             if (err != cudaSuccess) {
                 fprintf(stderr, "cudaPrintfDisplay failed: %s\n", cudaGetErrorString(err));
             }
         }

         // Always pair the successful init with cudaPrintfEnd, even on failure.
         cudaPrintfEnd();
     }

Both main.cpp and cuPrintf_wrapper.cu compile successfully, but no output is generated. I also tried writing the results
to a text file, and the same thing happened: the first version works, while the second one does not. Can anyone give me a hint?

Thanks. Yongsheng

I’m having some difficulties with cuPrintf myself, but I don’t have any difficulty doing what you describe.

As an example, take the simplePrintf example that comes in the SDK.

If you create a file main.cpp that contains only:

// Defined in simplePrintf.cu (the sample's main() renamed to wrapper()).
int wrapper(int argc, char **argv);

// Host-only entry point: hands the command line to the CUDA side unchanged.
// The wrapper's return value is not inspected; the exit status is always 0.
int main(int argc, char **argv)
{
    wrapper(argc, argv);
    return 0;
}

and change “main” to “wrapper” in simplePrintf.cu:

*** 30,36 ****
CUPRINTF("\tValue is:%d\n", val);
}

! int main(int argc, char **argv)
{
int devID;
cudaDeviceProp props;
--- 30,36 ----
CUPRINTF("\tValue is:%d\n", val);
}

! int wrapper(int argc, char **argv)
{
int devID;
cudaDeviceProp props;

and change the Makefile to include main.cpp and to use the right architecture for your GPU (mine is 1.1 on my laptop):

*** 36,45 ****

Add source files here

EXECUTABLE := simplePrintf

Cuda source files (compiled with cudacc)

! CUFILES_sm_10 := simplePrintf.cu
CUDEPS := cuPrintf.cu

C/C++ source files (compiled with gcc / c++)

! CCFILES := \

add command line parameters so we can target multiple architectures

GENCODE_ARCH := -gencode=arch=compute_10,code="sm_10,compute_10" -gencode=arch=compute_13,code="sm_13,compute_13" -gencode=arch=compute_20,code="sm_20,compute_20"
--- 36,45 ----

Add source files here

EXECUTABLE := simplePrintf

Cuda source files (compiled with cudacc)

! CUFILES_sm_11 := simplePrintf.cu
CUDEPS := cuPrintf.cu

C/C++ source files (compiled with gcc / c++)

! CCFILES := main.cpp

everything seems to work:

% verbose=1 make
mkdir -p ../../lib
mkdir -p obj/x86_64/release
mkdir -p ../../bin/linux/release
g++ -W -Wall -Wimplicit -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -Wno-unused-function -m64 -fno-strict-aliasing -I. -I/usr/local/cuda/include -I../../common/inc -I../../../shared//inc -DUNIX -O2 -o obj/x86_64/release/main.cpp.o -c main.cpp
/usr/local/cuda/bin/nvcc -gencode=arch=compute_11,code="sm_11,compute_11" -o obj/x86_64/release/simplePrintf.cu_11.o -c simplePrintf.cu -m64 --compiler-options -fno-strict-aliasing -I. -I/usr/local/cuda/include -I../../common/inc -I../../../shared//inc -DUNIX -O2
g++ -fPIC -m64 -o ../../bin/linux/release/simplePrintf obj/x86_64/release/main.cpp.o obj/x86_64/release/simplePrintf.cu_11.o -L/usr/local/cuda/lib64 -L../../lib -L../../common/lib/linux -L../../../shared//lib -lcudart -L/usr/local/cuda/lib64 -L../../lib -L../../common/lib/linux -L../../../shared//lib -lcudart -lcutil_x86_64 -lshrutil_x86_64 -ldl -lpthread

% ../../bin/linux/release/simplePrintf --noprompt

Using CUDA device [0]: Quadro FX 3700M
Device 0: "Quadro FX 3700M" with Compute 1.1 capability
cuPrintf() is called. Output:

magic=51217
[2, 0]: Value is:10
[2, 1]: Value is:10
[2, 2]: Value is:10
[2, 3]: Value is:10
[1, 0]: Value is:10
[1, 1]: Value is:10
[2, 4]: Value is:10
[1, 2]: Value is:10
[2, 5]: Value is:10
[1, 3]: Value is:10
[2, 6]: Value is:10
[2, 7]: Value is:10
[1, 4]: Value is:10
[1, 5]: Value is:10
[1, 6]: Value is:10
[1, 7]: Value is:10
[0, 0]: Value is:10
[0, 1]: Value is:10
[0, 2]: Value is:10
[0, 3]: Value is:10
[0, 4]: Value is:10
[0, 5]: Value is:10
[0, 6]: Value is:10
[0, 7]: Value is:10
[3, 0]: Value is:10
[3, 1]: Value is:10
[3, 2]: Value is:10
[3, 3]: Value is:10
[3, 4]: Value is:10
[3, 5]: Value is:10
[3, 6]: Value is:10
[3, 7]: Value is:10

PASSED