printf vs cuPrintf in kernels


I discovered that using printf in kernels causes me a lot of problems:
wrong printed values, and even system crashes after which no rerun is possible until a reboot.

This is on a GTX 660 (Kepler) card with driver 304.48, running Ubuntu Linux 10.04.

However, using cuPrintf works perfectly.

Just to let you know.

The old cuPrintf has a different default buffer size, so it might succeed where the builtin printf fails.

Look at the CUDA programming guide under the “Formatted Output” section.
It’s likely all you need to do is increase your printf buffer size with

cudaDeviceSetLimit(cudaLimitPrintfFifoSize, size_t size)

Increasing the buffer size does not help. Try this test code:

#include <stdio.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include ""
#include "cuPrintf.cuh"

    // Arithmetic helpers used in the kernel's update expression. Both the
    // arguments and the result are parenthesized so that compound arguments
    // (e.g. mul(a + b, c)) expand with the intended precedence.
    #define add(A, B)         ((A) + (B))
    #define mul(A, B)         ((A) * (B))

// Per-block offset multipliers: the kernel offsets its coordinates by
// bly*BLK_N and blx*BLK_M.
#define BLK_N 2
#define BLK_M 2
// size of thread block for calculating C (innermost loop);
// the kernel steps its coordinates by DIM_X / DIM_Y per loop iteration.
#define DIM_X 2
#define DIM_Y 2

// Test kernel: for every C element owned by the calling thread, computes
//     C[offsC] = alpha * A[...] + beta * C[offsC]
// and reports indices and values through cuPrintf along the way.
//
// Expected launch layout: 2D blocks of DIM_X x DIM_Y threads on a 2D grid in
// which block (blx, bly) starts at column blx*BLK_M and row bly*BLK_N.
// Each thread visits (BLK_M/DIM_X) x (BLK_N/DIM_Y) elements, so with that
// layout no two threads update the same element of C.
//
// Parameters:
//   M, N   upper bounds the computed coordinates are checked against
//   C      device buffer, read and written at coord_dCn*4 + coord_dCm
//   A      device buffer, read at coord_dCm + coord_dCn
//   alpha  scale applied to the value read from A
//   beta   scale applied to the previous value of C (C must be initialized)
//
// NOTE(review): cuPrintf output presumably requires the host to call
// cudaPrintfInit() before the launch and cudaPrintfDisplay() after it.
__global__ void     kernel_name (int M, int N, float *C,float *A, float alpha, float beta)
{
    int idx = threadIdx.x;  // thread's m dimension
    int idy = threadIdx.y;  // thread's n dimension

    int blx = blockIdx.x;   // block's m dimension
    int bly = blockIdx.y;   // block's n dimension

    // Number of elements each thread handles per dimension. Looping up to N / M
    // instead made threads of different blocks land on the same C element and
    // race on its read-modify-write.
    const unsigned int tilesN = BLK_N / DIM_Y;
    const unsigned int tilesM = BLK_M / DIM_X;

    //printf ( "idx,idy,blx,bly %d %d %d %d  \n",idx,idy,blx,bly);   //        --> bad  results
    cuPrintf ( "idx,idy,blx,bly %d %d %d %d  \n",idx,idy,blx,bly);   //        --> good results

    // Store C regs->dev
    #pragma unroll
    for (unsigned int n = 0; n < tilesN; n++) {
        int coord_dCn = bly*BLK_N + n*DIM_Y+idy;
        cuPrintf ( "n coord_dCn  %u %d   \n",n,coord_dCn);
        #pragma unroll
        for (unsigned int m = 0; m < tilesM; m++) {
            int coord_dCm = blx*BLK_M + m*DIM_X+idx;
            cuPrintf ( "n m   %u %u   \n",n,m);
            // Guard against coordinates outside the M x N problem.
            if (coord_dCm < M && coord_dCn < N) {
                // NOTE(review): the row stride is hard-coded to 4; this only
                // matches the bounds check when M == 4 -- confirm.
                int offsC = coord_dCn*4 + coord_dCm;
                cuPrintf ( "coord_dCm offsC %d %d   \n",coord_dCm,offsC);

                // NOTE(review): A is indexed by the sum of the coordinates, not
                // by offsC -- kept as in the original; confirm this is intended.
                float &regC = A[coord_dCm + coord_dCn];
                float &memC = C[offsC];
                memC = add(mul(alpha, regC), mul(beta, memC));
                // NOTE(review): %x is handed a pointer here; confirm how
                // cuPrintf formats it on a 64-bit build.
                cuPrintf("&memC %x  memC  %10.4f offsC %3d C[offsC] %10.4f \n",&memC,memC,offsC,C[offsC]);
            }
        }
    }
}

// Fills data[0 .. size-1] with pseudo-random floats in [0, 1] taken from rand().
// Does nothing when data is null or size is not positive.
void randomInit(float *data, int size)
{
    if (!data) {
        return;
    }
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

// Host driver for the printf/cuPrintf test:
//   1. selects a device and raises the device printf FIFO size,
//   2. allocates pinned host buffers a, c and device buffers d_a, d_c,
//   3. fills a with random values, zeroes d_c, and launches kernel_name,
//   4. copies the result back, waits for completion, and prints it.
// Every CUDA call is checked; returns 0 on success.
int main(int argc, char **argv)
{
    int cuda_device = findCudaDevice(argc, (const char **)argv);
    (void)cuda_device;  // only selected, not used further

    // Enlarge the FIFO used by in-kernel printf.
    size_t buf = 1000000;
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, buf));

    int M = 4, N = 4;
    size_t msize = (size_t)M * (size_t)N * sizeof(float);

    // allocate host memory
    float *a = 0;                     // pointer to the array data in host memory
    float *c = 0;                     // pointer to the array data in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, msize));
    checkCudaErrors(cudaMallocHost((void **)&c, msize));

    // allocate device memory
    float *d_a = 0;             // pointers to data and init value in the device memory
    float *d_c = 0;             // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, msize));
    checkCudaErrors(cudaMalloc((void **)&d_c, msize));

    // One thread per DIM_X x DIM_Y slot; one block per BLK_M x BLK_N tile
    // (ceil-div so partial tiles are still covered).
    dim3 bloc(DIM_X, DIM_Y, 1);
    dim3 grid((M + BLK_M - 1) / BLK_M, (N + BLK_N - 1) / BLK_N);

    // initialize host memory
    randomInit(a, M * N);
    for (int i = 0; i < M * N; i++) {
        printf ( "i %d a[i] %f \n",i,a[i]);
    }

    // cuPrintf needs its device-side buffer set up before the kernel runs.
    checkCudaErrors(cudaPrintfInit());

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_a, a, msize, cudaMemcpyHostToDevice));
    // The kernel reads C (beta * C), so give it a defined starting value.
    checkCudaErrors(cudaMemset(d_c, 0, msize));

    kernel_name<<< grid, bloc, 0>>>(M,N,  d_c,d_a,  1.1f, 0.10f);
    checkCudaErrors(cudaGetLastError());   // launch-configuration errors

    checkCudaErrors(cudaMemcpyAsync(c, d_c, msize, cudaMemcpyDeviceToHost));

    // The copy above is asynchronous with respect to the host: wait for it
    // (and the kernel) before touching c. This also surfaces in-kernel faults.
    checkCudaErrors(cudaDeviceSynchronize());

    // Dump what the kernel wrote through cuPrintf, then release its buffer.
    checkCudaErrors(cudaPrintfDisplay(stdout, true));
    cudaPrintfEnd();

    for (int i = 0; i < M * N; i++) {
        printf ( "i %d c[i] %f \n",i,c[i]);
    }

    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFree(d_c));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFreeHost(c));

    return 0;
}