is there any limit of printf buffer in a kernel?

Hi there,
I am debugging a code and need to print out the block index used in the kernel. Here is my code

#include <cuda.h>
#include <iostream>
#include "cuPrintf.cu"

using namespace std;

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
 /* Check synchronous errors, i.e. pre-launch */ \
 cudaError_t err = cudaGetLastError(); \
 if (cudaSuccess != err) { \
 fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 __FILE__, __LINE__, cudaGetErrorString(err) ); \
 exit(EXIT_FAILURE); \
 } \
 /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
 err = cudaThreadSynchronize(); \
 if (cudaSuccess != err) { \
 fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
 __FILE__, __LINE__, cudaGetErrorString( err) ); \
 exit(EXIT_FAILURE); \
 } \
} while (0)

int InitGPUSet()  
{  
  char GPU[100] = "GPU: ";  
  cudaDeviceProp tCard;  
  int num = 0;  
  if (cudaSuccess == cudaGetDeviceCount(&num))  
  {  
    for (int i = 0; i < num; ++ i)  
    {  
      cudaSetDevice(i);  
      cudaGetDeviceProperties(&tCard, i);  
      puts(strcat(GPU, tCard.name));
    }
   }  
   else return 0;  
   return 1;  
}

bool cuPrintInit()
{  
    cudaError_t err = cudaPrintfInit();  
    if(0 != strcmp("no error", cudaGetErrorString(err)))  return false;  
    return true;  
} 

__global__ void test(void)
{
  printf("blockIdx.x=%d blockIdx.y=%d\n", blockIdx.x, blockIdx.y);
}


int main(void)
{
  if(!InitGPUSet())  puts("device is not ready!");  
  else if(!cuPrintInit())  puts("device is not ready!");  
  else  
  {  
    CHECK_LAUNCH_ERROR();
    test<<<dim3(600, 600) , dim3(2, 2)>>>();
    CHECK_LAUNCH_ERROR();
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();
    CHECK_LAUNCH_ERROR();
  }
}

It seems that there is a limit of output buffer. Above code doesn’t show all outputs but just some line with blockIdx.y close to the last few values. So what’s the correct way to print out all possible value for blockIdx.x and blockIdx.y in a kernel? Thanks.

The device-side printf() in CUDA uses a circular buffer of fixed size. You can set this size with a call to cudaDeviceSetLimit() using the cudaLimitPrintfFifoSize parameter. See the CUDA Runtime API manual for details.