I played around a bit with your code and believe that this is an issue with internal shared memory misconfiguration. Multi-warp tiles will make use of the 1kb reserved shared memor per block. for sm 80 and newer.
I modified your code to print some special registers which are configured for the driver shared memory, and added non-graph kernel launches at the end to compare graph shared memory configuration to traditional configuration.
#include <assert.h>
#include <climits>
#include <stdio.h>
#include <vector>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void squareArray(uint32_t s)
{
int reserved_smem_offset_begin;
int reserved_smem_offset_end;
int reserved_smem_offset_cap;
int reserved_smem_offset_0;
int reserved_smem_offset_1;
unsigned int total_smem_size;
unsigned int aggr_smem_size;
asm("mov.b32 %0, %reserved_smem_offset_begin;"
"mov.b32 %1, %reserved_smem_offset_end;"
"mov.b32 %2, %reserved_smem_offset_cap;"
"mov.b32 %3, %reserved_smem_offset_0;"
"mov.b32 %4, %reserved_smem_offset_1;"
"mov.u32 %5, %total_smem_size;" //size of user-declared shared memory
//"mov.u32 %6, %aggr_smem_size;" // size of user-declared + reserved shared memory. only available for >= sm_90
:
"=r"(reserved_smem_offset_begin),
"=r"(reserved_smem_offset_end),
"=r"(reserved_smem_offset_cap),
"=r"(reserved_smem_offset_0),
"=r"(reserved_smem_offset_1),
"=r"(total_smem_size)
//"=r"(aggr_smem_size)
);
if(threadIdx.x + blockIdx.x == 0){
printf("reserved_smem_offset_begin %d\n"
"reserved_smem_offset_end %d\n"
"reserved_smem_offset_cap %d\n"
"reserved_smem_offset_0 %d\n"
"reserved_smem_offset_1 %d\n"
"total_smem_size %d\n"
"aggr_smem_size %d\n",
reserved_smem_offset_begin,
reserved_smem_offset_end,
reserved_smem_offset_cap,
reserved_smem_offset_0,
reserved_smem_offset_1,
total_smem_size,
aggr_smem_size
);
}
extern __shared__ int shared_data[];
if(threadIdx.x + blockIdx.x == 0){
printf("dynamic smem begins at %lu\n", __cvta_generic_to_shared(shared_data));
}
// cg::thread_block currentThreadBlock = cg::this_thread_block();
// auto tile64 = cg::tiled_partition<64>(currentThreadBlock);
// if (tile64.thread_rank() == 0) {
// shared_data[0] = s;
// }
// tile64.sync();
// auto warpTile32 = cg::tiled_partition<32>(currentThreadBlock);
// warpTile32.sync();
// tile64.sync();
}
int main(int argc, char **argv)
{
cudaStream_t stream;
cudaGraph_t graph;
cudaGraphExec_t graphExec;
cudaGraphNode_t squareKernelNode;
cudaKernelNodeParams kernelNodeParams = {0};
uint32_t s = 123;
void *squareKernelArgs[1] = {&s};
gpuErrchk(cudaGraphCreate(&graph, 0));
gpuErrchk(cudaStreamCreate(&stream));
kernelNodeParams.func = (void *)squareArray;
kernelNodeParams.gridDim = dim3(1024, 1, 1);
kernelNodeParams.blockDim = dim3(128, 1, 1);
kernelNodeParams.sharedMemBytes = 2048;
kernelNodeParams.kernelParams = (void **)squareKernelArgs;
gpuErrchk(cudaGraphAddKernelNode(&squareKernelNode, graph, nullptr, 0, &kernelNodeParams));
gpuErrchk(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
gpuErrchk(cudaGraphExecKernelNodeSetParams(graphExec, squareKernelNode, &kernelNodeParams));
gpuErrchk(cudaGraphLaunch(graphExec, stream));
gpuErrchk(cudaStreamSynchronize(stream));
// Second launch (Updated shared memory)
kernelNodeParams.gridDim = dim3(1024, 1, 1);
kernelNodeParams.sharedMemBytes = 1024; // FIX: REPLACE WITH 2048 - same as the previous graph launch
gpuErrchk(cudaGraphExecKernelNodeSetParams(graphExec, squareKernelNode, &kernelNodeParams));
gpuErrchk(cudaGraphLaunch(graphExec, stream));
gpuErrchk(cudaStreamSynchronize(stream));
gpuErrchk(cudaStreamDestroy(stream));
gpuErrchk(cudaGraphExecDestroy(graphExec));
printf("Graph executions completed.\n");
squareArray<<<1024,128,2048>>>(s);
squareArray<<<1024,128,1024>>>(s);
cudaDeviceSynchronize();
return 0;
}
With driver 580.95.05 and CUDA 13.0, I get the following output for sm_86 and sm_89, which produce the illegal memory access that you report.
reserved_smem_offset_begin 2048
reserved_smem_offset_end 2336
reserved_smem_offset_cap 3072
reserved_smem_offset_0 2048
reserved_smem_offset_1 2048
total_smem_size 2048
aggr_smem_size 0
dynamic smem begins at 0
reserved_smem_offset_begin 1024
reserved_smem_offset_end 2336
reserved_smem_offset_cap 2048
reserved_smem_offset_0 1024
reserved_smem_offset_1 2048
total_smem_size 1024
aggr_smem_size 0
dynamic smem begins at 0
Graph executions completed.
reserved_smem_offset_begin 2048
reserved_smem_offset_end 2336
reserved_smem_offset_cap 3072
reserved_smem_offset_0 2048
reserved_smem_offset_1 2048
total_smem_size 2048
aggr_smem_size 0
dynamic smem begins at 0
reserved_smem_offset_begin 1024
reserved_smem_offset_end 1312
reserved_smem_offset_cap 2048
reserved_smem_offset_0 1024
reserved_smem_offset_1 1024
total_smem_size 1024
aggr_smem_size 0
dynamic smem begins at 0
Note that reserved shared memory is located after dynamic shared memory, but more importantly, output does not match between the updated kernel in graph, and the standalone kernel with smem 1024.
On the otherhand, sm_120 will not produce illegal memory access, and its configuration output is consistent. (Also, reserved shared memory is located before dynamic shared memory.)
reserved_smem_offset_begin 0
reserved_smem_offset_end 288
reserved_smem_offset_cap 1024
reserved_smem_offset_0 64
reserved_smem_offset_1 0
total_smem_size 2048
aggr_smem_size 0
dynamic smem begins at 1024
reserved_smem_offset_begin 0
reserved_smem_offset_end 288
reserved_smem_offset_cap 1024
reserved_smem_offset_0 64
reserved_smem_offset_1 0
total_smem_size 1024
aggr_smem_size 0
dynamic smem begins at 1024
Graph executions completed.
reserved_smem_offset_begin 0
reserved_smem_offset_end 288
reserved_smem_offset_cap 1024
reserved_smem_offset_0 64
reserved_smem_offset_1 0
total_smem_size 2048
aggr_smem_size 0
dynamic smem begins at 1024
reserved_smem_offset_begin 0
reserved_smem_offset_end 288
reserved_smem_offset_cap 1024
reserved_smem_offset_0 64
reserved_smem_offset_1 0
total_smem_size 1024
aggr_smem_size 0
dynamic smem begins at 1024
I would suggesting filing a bug report , see How to report a bug