This is a follow-up to:
Installing CUDA 12.6 resolved the issue described in that post for my full program when it is run from Visual Studio.
A possibly related issue, which involves nvvp, is illustrated by the following small, complete program:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string>
#include <iostream>
#include <iomanip>
#include "cuda_profiler_api.h"
#define AllowCDP 1 // hard CDP enable -- 1 --> allow fire-and-forget and tail launches, 0 --> do not include them in the compile at all
const int32_t N_TailLaunches = 0; // soft number of k_root() self tail launches -- 0 --> no tail launches
const bool EnableFAF = false; // soft fire-and-forget launch enable
/* optional: to see the SM ID, uncomment this and change the printf in PrintIfBadBlockIdx()
__device__ __noinline__ static uint32_t GetSM_ID() {
    uint32_t ret;
    asm volatile ("mov.u32 %0, %%smid;" : "=r"(ret) );
    return ret;
}
*/
__device__ __noinline__ bool PrintIfBadBlockIdx( const char* AppendStr = "", bool AlwaysPrint = false ) {
    const uint32_t N_Blocks = gridDim.x;
    const uint32_t BlkIndex = blockIdx.x;
    const uint32_t ThrIndex = threadIdx.x;
    const bool LeadThr = ThrIndex == 0;
    bool BadIdx = false; // all threads
    if ( LeadThr ) {
        BadIdx = BlkIndex >= N_Blocks; // LeadThr only
        const char* BadStr = BadIdx ? " <<< !!!" : "";
        if ( BadIdx || AlwaysPrint ) {
            // printf( "\n ***** Block %3u of %3u, SM = %3u ***** %s %s", blockIdx.x, gridDim.x, GetSM_ID(), AppendStr, BadStr ); // to see the SM ID
            printf( "\n ***** Block %3u of %3u ***** %s %s" , blockIdx.x, gridDim.x, AppendStr, BadStr ); // using this instead of the line above does not change the reported errors
        }
    }
    return BadIdx; // threads other than thread 0 always return false
}
// return true and print message if CUDA error detected, e.g., call this function after any kernel launch by device code
__device__ inline bool CheckErrorCUDA( const char* Msg ) {
    cudaError_t ce = cudaGetLastError();
    if ( ce == cudaSuccess ) return false;
    printf( "\n***** CUDA device error %d after %s. *****", ce, Msg );
    return true;
}
// kernel that is launched into fire-and-forget streams
__global__ void k_fire() {
    if ( threadIdx.x == 0 ) {
        printf( "\nk_fire() Block %3u", blockIdx.x );
    }
    PrintIfBadBlockIdx( "k_fire()" );
    for ( int i = 0; i < 1000; ++ i ) {
        if ( ( clock() & 0xfff ) == 0x800 ) break; // wait a random interval
    }
}
// kernel that is launched by host or self tail-launched
__global__ void k_root( int32_t aRun, int32_t N_Relaunch, bool LaunchFAF ) {
    //const uint32_t N_Blocks = gridDim.x;
    const uint32_t BlkIndex = blockIdx.x;
    const uint32_t ThrIndex = threadIdx.x;
    const bool LeadThr = ThrIndex == 0;
    const bool LeadBlk = BlkIndex == 0;
    if ( threadIdx.x == 0 ) {
        // printf( "\nk_root( %4d, %4d ), Block %3u", aRun, N_Relaunch, blockIdx.x );
    }
    if ( PrintIfBadBlockIdx( "k_root()<pre -FAF>" ) ) {
        // printf( " in k_root( %3d, %2d, %s )<pre -FAF>", aRun, N_Relaunch, LaunchFAF ? "w/ FAF" : "no FAF" ); // to see details of launch with bad blockIdx.x
    }
    __syncthreads();
    // fire-and-forget launches
#if AllowCDP
    if ( LeadBlk && LaunchFAF && ( ThrIndex < 3 ) ) { // using 1 instead of 3 does not significantly change reported errors
        //k_fire<<< dim_grid, dim_block, nshared, stream >>> (
        k_fire<<< 5, 256, 8*1024, cudaStreamFireAndForget >>> ();
        CheckErrorCUDA( "k_root()'s k_fire() FAF launch" );
    }
#endif
    __syncthreads();
    for ( volatile int i = 0; i < 5000; ++ i ) clock(); // delay
    /* // this results in the same reports as the pre-FAF version above -- it is commented out to reduce clutter in the output
    if ( PrintIfBadBlockIdx( "k_root()<post-FAF>" ) ) {
        printf( " in k_root( %3d, %2d, %s )<post-FAF>", aRun, N_Relaunch, LaunchFAF ? "w/ FAF" : "no FAF" );
    }
    */
    __syncthreads();
    // tail launch
#if AllowCDP
    if ( LeadBlk && LeadThr && ( N_Relaunch >= 1 ) ) {
        printf( "\nk_root() tail launch with N_Relaunch = %2d.", N_Relaunch );
        //k_root<<< dim_grid, dim_block, nshared, stream >>> (
        k_root<<< gridDim, blockDim, 8*1024, cudaStreamTailLaunch >>> ( aRun, N_Relaunch - 1, LaunchFAF );
        CheckErrorCUDA( "k_root()'s k_root() tail launch" );
    }
#endif
}
int32_t randval = 12345678;
int my_rand() { return randval ++; } // deterministic substitute for rand()
int main() {
    cudaError_t ce;
    //cudaProfilerStart(); // this is now done in the run loop below
    ce = cudaSetDevice( 0 );
    if ( ce != cudaSuccess ) {
        std::cout << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"; return 1;
    }
    cudaStream_t Stream1;
    cudaStream_t Stream2;
    ce = cudaStreamCreateWithPriority( & Stream1, cudaStreamNonBlocking, -2 );
    if ( ce != cudaSuccess ) {
        std::cout << "cudaStreamCreateWithPriority() failed!"; return 1;
    }
    ce = cudaStreamCreateWithPriority( & Stream2, cudaStreamNonBlocking, -1 );
    if ( ce != cudaSuccess ) {
        std::cout << "cudaStreamCreateWithPriority() failed!"; return 1;
    }
    for ( int32_t Run = 0; Run < 250; ++ Run ) {
        if ( Run == 75 ) cudaProfilerStart();
        if ( Run == 150 ) cudaProfilerStop();
        // root launch into Stream1
        bool FAF = EnableFAF && ( my_rand() & 0x40 );
        uint32_t N_Blk = ( my_rand() & 0x1f ) + 1; // change the mask to see the effect on reported bad block IDs
        //k_root<<< dim_grid, dim_block, nshared, stream >>>(
        k_root<<< N_Blk, 64, 0, Stream1 >>>( Run, N_TailLaunches, FAF );
        // k_root<<< 1, 64, 0, Stream1 >>>( Run, N_TailLaunches, false );
        // root launch into Stream2
        FAF = EnableFAF && ( my_rand() & 0x40 );
        N_Blk = ( my_rand() & 0x0f ) + 1;
        //k_root<<< dim_grid, dim_block, nshared, stream >>>(
        k_root<<< N_Blk, 256, 0, Stream2 >>>( Run, N_TailLaunches, FAF );
        // k_root<<< 14, 256, 0, Stream2 >>>( Run, N_TailLaunches, false );
        ce = cudaGetLastError();
        if ( ce != cudaSuccess ) {
            std::cout << "k_root() kernel launch failed: " << cudaGetErrorString(ce); return 1;
        }
        if ( ( Run % 25 ) == 0 ) {
            ce = cudaDeviceSynchronize();
            if ( ce != cudaSuccess ) {
                std::cout << "cudaDeviceSynchronize() failed: " << cudaGetErrorString(ce); return 1;
            }
            std::cout << "\nFinished run " << std::setw( 5 ) << Run << std::flush;
        }
    } // end of runs for loop
    ce = cudaDeviceSynchronize();
    if ( ce != cudaSuccess ) {
        std::cout << "cudaDeviceSynchronize() failed: " << cudaGetErrorString(ce); return 1;
    }
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    ce = cudaDeviceReset();
    if ( ce != cudaSuccess ) {
        std::cout << "cudaDeviceReset failed!";
        return 1;
    }
    std::cout << "\nDone.\n\n" << std::flush;
}
When run from Visual Studio, the console output is what I expect:
Finished run 0
Finished run 25
Finished run 50
Finished run 75
Finished run 100
Finished run 125
Finished run 150
Finished run 175
Finished run 200
Finished run 225
Done.
The nvvp console output (abridged), with profiling initially disabled, is shown below.
Notes:
- The program enables profiling for runs ~75-~150 only.
- The first three error reports are likely from run 75.
- Output like
***** Block 13 of 6 ... !!!
indicates that the program detected a blockIdx.x value (13) that was greater than or equal to the gridDim.x value (6).
Finished run 0
Finished run 25
Finished run 50
***** Block 13 of 6 ***** k_root()<pre -FAF> <<< !!!
***** Block 12 of 6 ***** k_root()<pre -FAF> <<< !!!
***** Block 7 of 6 ***** k_root()<pre -FAF> <<< !!!
Finished run 75
***** Block 12 of 8 ***** k_root()<pre -FAF> <<< !!!
***** Block 13 of 8 ***** k_root()<pre -FAF> <<< !!!
***** Block 16 of 8 ***** k_root()<pre -FAF> <<< !!!
***** Block 14 of 8 ***** k_root()<pre -FAF> <<< !!!
***** Block 17 of 8 ***** k_root()<pre -FAF> <<< !!!
***** Block 15 of 8 ***** k_root()<pre -FAF> <<< !!!
***** Block 19 of 9 ***** k_root()<pre -FAF> <<< !!!
***** Block 18 of 9 ***** k_root()<pre -FAF> <<< !!!
***** Block 16 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 14 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 10 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 15 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 10 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 11 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 12 of 12 ***** k_root()<pre -FAF> <<< !!!
***** Block 13 of 12 ***** k_root()<pre -FAF> <<< !!!
...
Finished run 100
***** Block 12 of 10 ***** k_root()<pre -FAF> <<< !!!
***** Block 28 of 27 ***** k_root()<pre -FAF> <<< !!!
***** Block 28 of 27 ***** k_root()<pre -FAF> <<< !!!
***** Block 12 of 12 ***** k_root()<pre -FAF> <<< !!!
***** Block 3 of 3 ***** k_root()<pre -FAF> <<< !!!
***** Block 7 of 5 ***** k_root()<pre -FAF> <<< !!!
***** Block 12 of 6 ***** k_root()<pre -FAF> <<< !!!
***** Block 7 of 7 ***** k_root()<pre -FAF> <<< !!!
...
Finished run 125
***** Block 13 of 11 ***** k_root()<pre -FAF> <<< !!!
***** Block 14 of 12 ***** k_root()<pre -FAF> <<< !!!
***** Block 19 of 12 ***** k_root()<pre -FAF> <<< !!!
***** Block 16 of 12 ***** k_root()<pre -FAF> <<< !!!
***** Block 17 of 12 ***** k_root()<pre -FAF> <<< !!!
***** Block 20 of 14 ***** k_root()<pre -FAF> <<< !!!
***** Block 15 of 14 ***** k_root()<pre -FAF> <<< !!!
***** Block 21 of 14 ***** k_root()<pre -FAF> <<< !!!
***** Block 19 of 14 ***** k_root()<pre -FAF> <<< !!!
***** Block 20 of 16 ***** k_root()<pre -FAF> <<< !!!
***** Block 18 of 16 ***** k_root()<pre -FAF> <<< !!!
***** Block 21 of 16 ***** k_root()<pre -FAF> <<< !!!
...
Finished run 150
Finished run 175
Finished run 200
Finished run 225
Done.
Note that, as configured, the program never launches kernels via CDP (CUDA Dynamic Parallelism).
This can be changed in the first few lines of the program (N_TailLaunches and/or EnableFAF).
If AllowCDP is defined as 0 instead of 1, the reports of bad blockIdx.x do not appear.
It seems that having CDP code in the program is enough to trigger the issue, even if the running code never launches kernels from the device.
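For clarity, the build in which the reports do not appear differs only in that one define at the top of the program; everything else is unchanged:

#define AllowCDP 0 // fire-and-forget and tail-launch code is not compiled at all; no <<< !!! reports under nvvp

(One could also wrap the define in an #ifndef guard so it can be set with -DAllowCDP=0 on the nvcc command line; the results above were produced with the define hard-coded as shown in the listing.)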
The range of bad block indices observed can be changed by adjusting the mask in the line below.
E.g., changing the mask from 0x1f to 0x3f approximately doubles the range of bad blockIdx.x values reported.
uint32_t N_Blk = ( my_rand() & 0x1f ) + 1; // change the mask to see the effect on reported bad block IDs
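That is, the variant that approximately doubles the range of reported bad blockIdx.x values would be:

uint32_t N_Blk = ( my_rand() & 0x3f ) + 1; // grids of 1..64 blocks instead of 1..32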
It seems that sometimes a kernel sees illegal blockIdx.x values that would be legal for a different grid launched by the program.
Only host-launched kernels seem to report bad blockIdx.x values.
When I enable fire-and-forget and/or tail launches, the device-launched kernels never report bad blockIdx.x values; only the host-launched kernels do.
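For reference, the device-side launches are enabled by changing the soft switches near the top of the program; e.g. (the values here are just example settings):

const int32_t N_TailLaunches = 2;  // each host-launched k_root() tail-launches itself 2 more times
const bool EnableFAF = true;       // fire-and-forget k_fire() launches, still gated per launch by a my_rand() bit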
I have a GTX 1070 Ti that is not used for display.
Windows 10, Visual Studio 2017, CUDA 12.6.
Visual Profiler version 12.6.
The code is compiled as relocatable device code (-rdc=true).
The SASS for PrintIfBadBlockIdx() is available in the post linked above.
The full nvcc command line (Debug build) is:
C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" -gencode=arch=compute_61,code=\"sm_61,compute_61\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" -G --keep --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -src-in-ptx -g -DWIN32 -DWIN64 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MDd " -Xcompiler "/Fdx64\Debug\vc141.pdb" -o C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\x64\Debug\main_exper05.cu.obj "C:\Users\Steve\source\repos\Expers\Exper05_VolatileAtomic\main_exper05.cu"
Is there an explanation for what is going on?
I wonder if my GPU is starting to fail.