Run below simple kernel in H100 GPU. I get runtime error from cudaGetLastError() and cudaDeviceSynchronize().
The CUDA error says: “an illegal instruction was encountered”.
But if I remove printf(), the kernel can successfully run. Is there any constrain between stenmaxreg and printf function call ?
template<uint32_t RegCount>
__device__ __inline__ void warpgroup_reg_alloc(){
asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
}
template<uint32_t RegCount>
__device__ __inline__ void warpgroup_reg_dealloc(){
asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
}
__global__
__launch_bounds__(768, 1)
void divergentKernelError(int *input, int *output) {
int thread_id = blockIdx.x*blockDim.x + threadIdx.x;
int seed = *(input + thread_id);
int warpgroup_id = thread_id/128;
int scratch_data[BATCH_SIZE];
if((warpgroup_id&0x01) == 0){
// even warp group do nothing, only write pattern data, sacrifice its register resource
warpgroup_reg_dealloc<24>();
printf("decrease register \n");
}else{
// odd warp group get enough register, and do compute
warpgroup_reg_alloc<128>(); // 80+80-24 = 136
printf("increase register \n");
}
}