printf vs cuPrintf in kernels

jam11 · February 5, 2013, 6:02pm

Hi,

I discovered that using printf in kernels gives me a lot of problems:
wrong printed values, and even system crashes after which no rerun is possible until a reboot.

This is on a GTX 660 (Kepler) card with driver 304.48 on Ubuntu Linux 10.04.

However, using cuPrintf works perfectly.

Just to let you know.

SPWorley · February 5, 2013, 7:30pm

The old cuPrintf has a different default buffer size, so it might succeed where the builtin printf fails.

Look at the CUDA programming guide under the “Formatted Output” section.
It’s likely all you need to do is increase your printf buffer size with

cudaDeviceSetLimit(cudaLimitPrintfFifoSize, size_t size)

jam11 · February 5, 2013, 11:02pm

New size does not help. Try this test code:

#include <stdio.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "cuprintf.cu"
#include "cuPrintf.cuh"

    // Arithmetic helpers used by the kernel's update expression.
    // Arguments and the whole expansion are parenthesized so that compound
    // arguments such as mul(a + b, c) keep the intended precedence.
    #define add(A, B)         ((A) + (B))
    #define mul(A, B)         ((A) * (B))


// extent of C covered by one thread block, per dimension
#define BLK_N 2
#define BLK_M 2
// size of thread block for calculating C (innermost loop)
#define DIM_X 2
#define DIM_Y 2


/*
 * Debug kernel: updates C in place as  C = alpha*A + beta*C  and traces every
 * step through cuPrintf.
 *
 * Layout: element (m, n) of both A and C is addressed as  n*M + m
 * (M is the stride between consecutive n). A and C must each hold M*N floats.
 *
 * Launch: blockDim = (DIM_X, DIM_Y); each block covers a BLK_M x BLK_N tile,
 * so the grid needs ceil(M/BLK_M) x ceil(N/BLK_N) blocks. Every thread
 * handles (BLK_M/DIM_X) x (BLK_N/DIM_Y) elements of its block's tile, which
 * guarantees each element of C is updated by exactly one thread.
 *
 * C is read (beta term) before it is written, so it must be initialized by
 * the caller. cudaPrintfInit() must have been called before the launch.
 */
__global__ void     kernel_name (int M, int N, float *C,float *A, float alpha, float beta)
{
    const int idx = threadIdx.x;  // thread's m dimension
    const int idy = threadIdx.y;  // thread's n dimension

    const int blx = blockIdx.x;   // block's m dimension
    const int bly = blockIdx.y;   // block's n dimension

    //printf ( "idx,idy,blx,bly %d %d %d %d  \n",idx,idy,blx,bly);   //        --> bad  results
    cuPrintf ( "idx,idy,blx,bly %d %d %d %d  \n",idx,idy,blx,bly);   //        --> good results

    // Number of elements each thread owns inside its block's tile. Looping
    // over the full M / N here (as before) made threads of different blocks
    // read-modify-write the same C element concurrently.
    const unsigned int tilesN = BLK_N / DIM_Y;
    const unsigned int tilesM = BLK_M / DIM_X;

    // Store C regs->dev
    #pragma unroll
    for (unsigned int n = 0; n < tilesN; n++) {
        int coord_dCn = bly*BLK_N + n*DIM_Y+idy;
        cuPrintf ( "n coord_dCn  %u %d   \n",n,coord_dCn);
        #pragma unroll
        for (unsigned int m = 0; m < tilesM; m++) {
            int coord_dCm = blx*BLK_M + m*DIM_X+idx;
            cuPrintf ( "n m   %u %u   \n",n,m);
            // Guard the matrix edge: the grid may overhang M or N.
            if (coord_dCm < M && coord_dCn < N) {
                // Stride is M (was a literal 4, only valid for M == 4).
                int offsC = coord_dCn*M + coord_dCm;
                cuPrintf ( "coord_dCm offsC %d %d   \n",coord_dCm,offsC);

                // Read A at the same (m, n) position as C; the previous
                // index coord_dCm + coord_dCn ignored the stride and mapped
                // different elements onto the same input value.
                const float regC = A[offsC];
                float &memC = C[offsC];
                memC = add(mul(alpha, regC), mul(beta, memC));
                cuPrintf("&memC %x  memC  %10.4f offsC %3d C[offsC] %10.4f \n",&memC,memC,offsC,C[offsC]);
            }
        }
    }
}


// Fills the first `size` entries of `data` with rand() scaled by RAND_MAX,
// i.e. values in [0, 1]. Does nothing when size <= 0.
void randomInit(float *data, int size)
{
    const float scale = (float)RAND_MAX;
    float *cursor = data;
    int remaining = size;

    while (remaining > 0) {
        *cursor = rand() / scale;
        ++cursor;
        --remaining;
    }
}


// Test driver: fills an M x N host matrix with random values, runs
// kernel_name on the device with cuPrintf tracing enabled, then prints the
// cuPrintf log and the resulting C matrix.
int main(int argc, char **argv)
{
    int cuda_device = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDevice(&cuda_device));

    // Enlarge the device-side printf FIFO (only matters when the kernel uses
    // the built-in printf instead of cuPrintf).
    const size_t buf = 1000000;
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, buf));

    const int M = 4, N = 4;
    // Derive the buffer size from the matrix shape instead of a literal 16.
    const size_t msize = (size_t)M * N * sizeof(float);

    // allocate (pinned) host memory
    float *a = 0;                     // input matrix in host memory
    float *c = 0;                     // result matrix in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, msize));
    checkCudaErrors(cudaMallocHost((void **)&c, msize));

    // allocate device memory
    float *d_a = 0;                   // input matrix in device memory
    float *d_c = 0;                   // result matrix in device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, msize));
    checkCudaErrors(cudaMalloc((void **)&d_c, msize));

    // The kernel computes alpha*A + beta*C, so C is read before it is
    // written; give it a defined starting value.
    checkCudaErrors(cudaMemset(d_c, 0, msize));

    // One thread block is DIM_X x DIM_Y threads and covers a BLK_M x BLK_N
    // tile; round the grid up so the whole matrix is covered.
    dim3 bloc(DIM_X, DIM_Y, 1);
    dim3 grid((M + BLK_M - 1) / BLK_M, (N + BLK_N - 1) / BLK_N);

    // initialize host memory
    randomInit(a, M * N);
    for (int i = 0; i < M * N; i++)
        printf ( "i %d a[i] %f \n",i,a[i]);

    checkCudaErrors(cudaPrintfInit());

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_a, a, msize, cudaMemcpyHostToDevice));

    kernel_name<<< grid, bloc, 0>>>(M,N,  d_c,d_a,  1.1f, 0.10f);
    // Launch-configuration errors are only reported through the last-error
    // state, not by the launch statement itself.
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpyAsync(c, d_c, msize, cudaMemcpyDeviceToHost));

    // The copy into pinned memory is asynchronous: wait for the kernel and
    // the copy to finish (and surface any execution error) before the host
    // touches c or the cuPrintf buffer.
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaPrintfDisplay(stdout,false));

    for (int i = 0; i < M * N; i++)
        printf ( "i %d c[i] %f \n",i,c[i]);

    cudaPrintfEnd();

    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFreeHost(c));
    checkCudaErrors(cudaFree(d_c));

    checkCudaErrors(cudaDeviceReset());

    return 0;
}

Topic		Replies	Views
Can I print-to-file from a kernel? CUDA Programming and Performance cuda	10	3952	September 29, 2020
just for fun! my own implementation of 'cuPrintf()' enabling output debug message from k CUDA Programming and Performance	3	2547	March 31, 2010
Printf does not work in emulation mode /tmp/xxxxxxxx_stub.c: no such file or directory CUDA Programming and Performance	7	2406	February 1, 2010
cuPrintf released to registered developers CUDA Programming and Performance	13	13970	June 16, 2014
strange behavior writing to global memory CUDA Programming and Performance	7	871	January 29, 2020
is there any limit of printf buffer in a kernel? CUDA Programming and Performance	1	2068	February 12, 2014
No printf(".") output from the kernel CUDA Programming and Performance	6	2585	March 15, 2023
An sprintf() which works in your kernel? It's almost here, help beta-test it CUDA Programming and Performance tools , debugging-and-troubleshooting	5	1669	October 3, 2023
Problems with cudaMalloc(), on printf() just in the kernelcode CUDA Programming and Performance	6	5106	November 29, 2010
printf inside a kernel is not working nVIDIA Quadro 4000 CUDA Programming and Performance	2	3708	November 7, 2011

printf vs cuPrintf in kernels

Related topics