is there any limit of printf buffer in a kernel?

Kilong · February 12, 2014, 6:01am

Hi there,
I am debugging a code and need to print out the block index used in the kernel. Here is my code

#include <cuda.h>
#include <iostream>
#include "cuPrintf.cu"

using namespace std;

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
 /* Check synchronous errors, i.e. pre-launch */ \
 cudaError_t err = cudaGetLastError(); \
 if (cudaSuccess != err) { \
 fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 __FILE__, __LINE__, cudaGetErrorString(err) ); \
 exit(EXIT_FAILURE); \
 } \
 /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
 err = cudaThreadSynchronize(); \
 if (cudaSuccess != err) { \
 fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 __FILE__, __LINE__, cudaGetErrorString( err) ); \
 exit(EXIT_FAILURE); \
 } \
} while (0)

int InitGPUSet()  
{  
  char GPU[100] = "GPU: ";  
  cudaDeviceProp tCard;  
  int num = 0;  
  if (cudaSuccess == cudaGetDeviceCount(&num))  
  {  
    for (int i = 0; i < num; ++ i)  
    {  
      cudaSetDevice(i);  
      cudaGetDeviceProperties(&tCard, i);  
      puts(strcat(GPU, tCard.name));
    }
   }  
   else return 0;  
   return 1;  
}

bool cuPrintInit()
{  
    cudaError_t err = cudaPrintfInit();  
    if(0 != strcmp("no error", cudaGetErrorString(err)))  return false;  
    return true;  
} 

__global__ void test(void)
{
  printf("blockIdx.x=%d blockIdx.y=%d\n", blockIdx.x, blockIdx.y);
}


int main(void)
{
  if(!InitGPUSet())  puts("device is not ready!");  
  else if(!cuPrintInit())  puts("device is not ready!");  
  else  
  {  
    CHECK_LAUNCH_ERROR();
    test<<<dim3(600, 600) , dim3(2, 2)>>>();
    CHECK_LAUNCH_ERROR();
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();
    CHECK_LAUNCH_ERROR();
  }
}

It seems that there is a limit of output buffer. Above code doesn’t show all outputs but just some line with blockIdx.y close to the last few values. So what’s the correct way to print out all possible value for blockIdx.x and blockIdx.y in a kernel? Thanks.

njuffa · February 12, 2014, 7:32am

The device-side printf() in CUDA uses a circular buffer of fixed size. You can set this size with a call to cudaDeviceSetLimit() using the cudaLimitPrintfFifoSize parameter. See the CUDA Runtime API manual for details.

Topic		Replies	Views
No printf(".") output from the kernel CUDA Programming and Performance	6	2072	March 15, 2023
printf vs cuPrintf in kernels CUDA Programming and Performance	2	5816	February 5, 2013
Can I print-to-file from a kernel? CUDA Programming and Performance cuda	10	3658	September 29, 2020
Limited output of kernel printf CUDA Programming and Performance	4	2293	September 1, 2011
just for fun! my own implementation of 'cuPrintf()' enabling output debug message from k CUDA Programming and Performance	3	2542	March 31, 2010
notification of printf cudaLimitPrintfFifoSize buffer exceeded? CUDA Programming and Performance	0	637	June 23, 2014
for loop inside kernel CUDA Programming and Performance	2	5368	September 12, 2011
help to clairfy usage of number of grids and number of blocks in kernal CUDA Programming and Performance	0	611	February 14, 2014
Size of printf buffer CUDA Programming and Performance	4	1971	December 2, 2020
printf in cuda kernel CUDA Programming and Performance	1	1211	November 26, 2019

is there any limit of printf buffer in a kernel?

Related topics