Bad blockIdx.x when profiling with nvvp

This is a follow-up to:

Installing CUDA 12.6 resolved the issue in that post for my full program when run in Visual Studio.

A possibly related issue, that involves nvvp, is illustrated by this small complete program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <string>
#include <iostream>
#include <iomanip>

#include "cuda_profiler_api.h"



// CDP (CUDA Dynamic Parallelism) switches: AllowCDP is a compile-time gate,
// N_TailLaunches / EnableFAF are run-time gates used only when AllowCDP is 1.
#define          AllowCDP           1          // hard CDP enable  --  1 --> allow fire-and-forget and tail launches, 0 --> do not include them in the compile at all
const  int32_t   N_TailLaunches  =  0;         // soft number of k_root() self tail launches  --  0 --> no tail launches 
const  bool      EnableFAF       =  false;     // soft fire-and-forget launch enable


/* optional: to see the SM ID, uncomment this and change the printf in PrintIfBadBlockIdx()
__device__  __noinline__  static  uint32_t GetSM_ID() {
  uint32_t ret;
  asm  volatile ("mov.u32 %0, %%smid;" : "=r"(ret) );
  return  ret;
}
*/


// Diagnostic helper: on thread 0 of each block, compare blockIdx.x against
// gridDim.x and print a report line when the index is out of range (or
// unconditionally when AlwaysPrint is set).
// Returns true only on thread 0 of a block with an invalid blockIdx.x;
// all other threads always return false.
__device__  __noinline__  bool PrintIfBadBlockIdx( const char* AppendStr = "", bool AlwaysPrint = false ) {
  bool badIndex = false;                                        // result seen by all threads
  if ( threadIdx.x == 0 ) {                                     // only the lead thread checks and prints
    badIndex = ( blockIdx.x >= gridDim.x );
    if ( badIndex || AlwaysPrint ) {
      const char* marker = badIndex ? " <<< !!!" : "";
//    printf( "\n  ***** Block %3u of %3u, SM = %3u *****  %s %s", blockIdx.x, gridDim.x, GetSM_ID(), AppendStr, marker );     // to see the SM ID
      printf( "\n  ***** Block %3u of %3u *****  %s %s"          , blockIdx.x, gridDim.x,             AppendStr, marker );     // using this instead of the line above does not change the reported errors
    }
  }
  return badIndex;  // threads other than thread 0 always return false
}


// Device-side error check: returns true (and prints the numeric error code)
// if a CUDA error is pending, e.g. call this after any device-side kernel launch.
__device__  inline  bool    CheckErrorCUDA( const char* Msg ) {
  const cudaError_t status = cudaGetLastError();
  if ( status == cudaSuccess ) {
    return false;
  }
  printf( "\n***** CUDA device error %d after %s. *****", status, Msg );
  return true;
}


// Kernel launched into fire-and-forget streams by k_root().
// Prints its block index from the lead thread, runs the bad-blockIdx check,
// then spins for a pseudo-random interval based on clock() before exiting.
__global__  void  k_fire() {
  if ( threadIdx.x  ==  0 ) {
    printf(      "\nk_fire() Block %3u", blockIdx.x );
  }
  PrintIfBadBlockIdx( "k_fire()" );

  // wait a random interval: poll clock() up to 1000 times for a magic bit pattern
  int iter = 0;
  while ( iter < 1000 ) {
    if ( ( clock()  &  0xfff )  ==  0x800 )  break;
    ++ iter;
  }
}


// Kernel launched by the host (or self tail-launched while N_Relaunch > 0).
// It runs the bad-blockIdx check, optionally fire-and-forget launches
// k_fire() (CDP), delays, then optionally tail-launches itself (CDP).
//   aRun       -- host run-loop index, passed through for diagnostics only
//   N_Relaunch -- remaining number of self tail launches (0 --> none)
//   LaunchFAF  -- when true, block 0 performs fire-and-forget k_fire() launches
// NOTE(repro): statement order, __syncthreads() placement, and the #if AllowCDP
// guards are deliberate -- the mere presence of CDP launch code in the compile
// appears to matter for the reported bad blockIdx.x values.
__global__  void  k_root( int32_t aRun, int32_t N_Relaunch, bool LaunchFAF ) {
//const uint32_t   N_Blocks     =    gridDim.x;
  const uint32_t   BlkIndex     =   blockIdx.x;
  const uint32_t   ThrIndex     =  threadIdx.x;
  const bool       LeadThr      =  ThrIndex  ==  0;
  const bool       LeadBlk      =  BlkIndex  ==  0;

  if ( threadIdx.x  ==  0 ) {
//  printf(      "\nk_root( %4d, %4d ), Block %3u", aRun, N_Relaunch, blockIdx.x );
  }


  // check blockIdx.x before any (possible) fire-and-forget launches
  if ( PrintIfBadBlockIdx(    "k_root()<pre -FAF>" ) ) {
//  printf( " in k_root( %3d, %2d, %s )<pre -FAF>", aRun, N_Relaunch, LaunchFAF ? "w/ FAF" : "no FAF" );    // to see details of launch with bad blockIdx.x
  }

  __syncthreads();


// fire-and-forget launches (compiled in only when AllowCDP, executed only when LaunchFAF)
#if  AllowCDP
  if ( LeadBlk  &&  LaunchFAF  &&  ( ThrIndex < 3 ) ) {              // using 1 instead of 3 does not significantly change reported errors
  //k_fire<<< dim_grid, dim_block, nshared,                  stream >>> (
    k_fire<<<        5,       256,  8*1024, cudaStreamFireAndForget >>> ();
    CheckErrorCUDA( "k_root()'s k_fire() FAF launch" );
  }
#endif


  __syncthreads();
  for ( volatile int i = 0; i < 5000; ++ i ) clock();   // delay
/*     // this results in the same reports as the pre-FAF version above -- it is commented out to reduce clutter in the output
  if ( PrintIfBadBlockIdx(    "k_root()<post-FAF>" ) ) {
    printf( " in k_root( %3d, %2d, %s )<post-FAF>", aRun, N_Relaunch, LaunchFAF ? "w/ FAF" : "no FAF" );
  }
*/

  __syncthreads();

// tail launch: lead thread of block 0 relaunches this kernel with N_Relaunch - 1
#if     AllowCDP
  if ( LeadBlk  &&  LeadThr  &&  ( N_Relaunch  >= 1 ) ) {
    printf( "\nk_root() tail launch with N_Relaunch = %2d.", N_Relaunch );

  //k_root<<< dim_grid, dim_block, nshared,               stream >>> (
    k_root<<<  gridDim,  blockDim,  8*1024, cudaStreamTailLaunch >>> ( aRun, N_Relaunch - 1, LaunchFAF );
    CheckErrorCUDA( "k_root()'s k_root() tail launch" );
  }
#endif

}

// Deterministic stand-in for rand(): yields a simple increasing sequence so
// every run produces the same launch configurations.
int32_t  randval  = 12345678;
int my_rand() {
  const int current = randval;
  randval += 1;
  return current;
}


// Host driver.
// Creates two non-blocking priority streams, then performs 250 runs; each run
// launches k_root() into both streams with a pseudo-randomly varying block
// count.  Profiling is enabled only for runs 75..149 (cudaProfilerStart/Stop)
// so behavior inside and outside the profiled window can be compared.
// Returns 0 on success, 1 on any CUDA error.
int main() {

  cudaError_t   ce;

//cudaProfilerStart();     // this is now done in the run loop below

  ce = cudaSetDevice( 0 );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?";    return 1;
  }

  // two non-blocking streams with different (elevated) priorities
  cudaStream_t  Stream1;
  cudaStream_t  Stream2;
       ce  =  cudaStreamCreateWithPriority( & Stream1, cudaStreamNonBlocking, -2 );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaStreamCreateWithPriority() failed!";    return 1;
  }
       ce  =  cudaStreamCreateWithPriority( & Stream2, cudaStreamNonBlocking, -1 );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaStreamCreateWithPriority() failed!";    return 1;
  }


  for ( int32_t Run  =  0; Run < 250; ++ Run ) {

    // profile only the middle runs
    if ( Run  ==  75 )  cudaProfilerStart();
    if ( Run  == 150 )  cudaProfilerStop();


// root launch into Stream1
    bool       FAF   =  EnableFAF  &&  ( my_rand() & 0x40 );
    uint32_t  N_Blk  =                 ( my_rand() & 0x1f ) + 1;        // change the mask to see the effect on reported bad block IDs

  //k_root<<< dim_grid, dim_block,    nshared, stream  >>>(
    k_root<<<    N_Blk,        64,          0, Stream1 >>>( Run, N_TailLaunches, FAF    );
//  k_root<<<        1,        64,          0, Stream1 >>>( Run, N_TailLaunches, false  );


// root launch into Stream2
               FAF   =  EnableFAF  &&  ( my_rand() & 0x40 );
              N_Blk  =                 ( my_rand() & 0x0f ) + 1;

  //k_root<<< dim_grid, dim_block,    nshared, stream  >>>(
    k_root<<<    N_Blk,       256,          0, Stream2 >>>( Run, N_TailLaunches, FAF   );
//  k_root<<<       14,       256,          0, Stream2 >>>( Run, N_TailLaunches, false );

    // catch launch-configuration errors from the two launches above
         ce  = cudaGetLastError();
    if ( ce != cudaSuccess ) {
      std::cout  << "k_root() kernel launch failed: " << cudaGetErrorString(ce);    return 1;
    }


    // periodically drain all outstanding work so in-kernel faults surface promptly
    if ( ( Run %  25 )  ==  0 ) {
           ce  = cudaDeviceSynchronize();
      if ( ce != cudaSuccess ) {
        std::cout  << "cudaDeviceSynchronize() failed: " << cudaGetErrorString(ce);    return 1;
      }
      std::cout  << "\nFinished run " << std::setw( 5 ) << Run << std::flush;
    }

  }  // end of runs for loop

       ce  = cudaDeviceSynchronize();
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaDeviceSynchronize() failed: " << cudaGetErrorString(ce);    return 1;
  }

  // destroy the streams before resetting the device (fix: they were never destroyed)
       ce  = cudaStreamDestroy( Stream1 );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaStreamDestroy() failed: " << cudaGetErrorString(ce);    return 1;
  }
       ce  = cudaStreamDestroy( Stream2 );
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaStreamDestroy() failed: " << cudaGetErrorString(ce);    return 1;
  }

  // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  ce = cudaDeviceReset();
  if ( ce != cudaSuccess ) {
    std::cout  << "cudaDeviceReset failed!";
    return 1;
  }

  std::cout    << "\nDone.\n\n" << std::flush;
  return 0;
}

When run from Visual Studio, the console output is what I expect:

Finished run     0
Finished run    25
Finished run    50
Finished run    75
Finished run   100
Finished run   125
Finished run   150
Finished run   175
Finished run   200
Finished run   225
Done.

The nvvp console output (abridged), with profiling initially disabled, is shown below.
Notes:

  • The program enables profiling for runs ~75-~150 only.
  • The first three error reports are likely from run 75.
  • Output like ***** Block 13 of 6 ... !!! indicates that the program detected a blockIdx.x value (13) that was greater than or equal to the gridDim.x value (6).
Finished run     0
Finished run    25
Finished run    50
  ***** Block  13 of   6 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  12 of   6 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block   7 of   6 *****  k_root()<pre -FAF>  <<< !!!
Finished run    75
  ***** Block  12 of   8 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  13 of   8 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  16 of   8 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  14 of   8 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  17 of   8 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  15 of   8 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  19 of   9 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  18 of   9 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  16 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  14 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  10 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  15 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  10 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  11 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  12 of  12 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  13 of  12 *****  k_root()<pre -FAF>  <<< !!!
...
Finished run   100
  ***** Block  12 of  10 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  28 of  27 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  28 of  27 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  12 of  12 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block   3 of   3 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block   7 of   5 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  12 of   6 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block   7 of   7 *****  k_root()<pre -FAF>  <<< !!!
...
Finished run   125
  ***** Block  13 of  11 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  14 of  12 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  19 of  12 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  16 of  12 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  17 of  12 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  20 of  14 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  15 of  14 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  21 of  14 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  19 of  14 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  20 of  16 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  18 of  16 *****  k_root()<pre -FAF>  <<< !!!
  ***** Block  21 of  16 *****  k_root()<pre -FAF>  <<< !!!
...
Finished run   150
Finished run   175
Finished run   200
Finished run   225
Done.

Note that the program is set up so that it never launches kernels by CDP.
This can be changed in the first few lines of the program (N_TailLaunches and/or EnableFAF).

If AllowCDP is defined as 0 instead of 1, the reports of bad blockIdx.x do not appear.
It seems that having CDP code in the program is enough to trigger the issue, even if the running code never launches kernels from the device.

The range of bad block indexes observed can be changed by changing the mask in the line below.
E.g., changing the mask from 0x1f to 0x3f approximately doubles the range of bad blockIdx.x values reported.

uint32_t  N_Blk  =                 ( my_rand() & 0x1f ) + 1;        // change the mask to see the effect on reported bad block IDs

It seems that sometimes a kernel sees illegal blockIdx.x values that would be legal for a different grid launched by the program.

Only host-launched kernels seem to report bad blockIdx.x values.
When I enable fire-and-forget and/or tail launches, I never see reports of bad blockIdx.x values from the device-launched kernels, only the host-launched kernels.

I have a GTX 1070 Ti that is not used for display.
Windows 10, Visual Studio 2017, CUDA 12.6.
Visual Profiler version 12.6.
The code is compiled with relocatable device code enabled (-rdc=true; see the nvcc command line below).
The SASS for PrintIfBadBlockIdx() is available in the post linked above.

C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" -gencode=arch=compute_61,code=\"sm_61,compute_61\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -rdc=true  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include"  -G  --keep --keep-dir x64\Debug  -maxrregcount=0   --machine 64 --compile -cudart static -src-in-ptx -g  -DWIN32 -DWIN64 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MDd " -Xcompiler "/Fdx64\Debug\vc141.pdb" -o C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\x64\Debug\main_exper05.cu.obj "C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\main_exper05.cu"

Is there an explanation for what is going on?
I wonder if my GPU is starting to fail.