I have some CUDA code that launches a simple graph with only a single kernelNode and logs the block start and end times, as well as the SM IDs. Strangely, the output suggests that every other block is being assigned to SM 0 (all at the same time). This should not be possible, however, since the blocks have 1024 threads each and my RTX 4050 has a 1536 thread per SM limit (right?).
#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>
#include "helper_cuda.h"
#include "benchmark_gpu_utilities.h"
__global__ void kernel_A(uint64_t *block_times, uint64_t *block_smids){
uint64_t start_time = GlobalTimer64();
// First, record the kernel and block start times
// and block SM iDs
if (threadIdx.x == 0) {
block_times[blockIdx.x * 2] = start_time;
block_smids[blockIdx.x] = GetSMID();
}
// Spin for 1 second
while ((GlobalTimer64()-start_time)<(100*1000*1000)){
continue;
}
// Record the kernel and block end times.
if (threadIdx.x == 0) {
block_times[blockIdx.x * 2 + 1] = GlobalTimer64();
}
}
void setKernelNodeParams(cudaKernelNodeParams *kernelNodeParams, int blocksPerGrid, int threadsPerBlock, uint64_t **block_times_d, uint32_t **block_smids_d){
void **kernelArgs = (void **) malloc(sizeof(void*)*2);
kernelArgs[0] = (void *) block_times_d;
kernelArgs[1] = block_smids_d;
kernelNodeParams->gridDim = dim3(blocksPerGrid, 1, 1);
kernelNodeParams->blockDim = dim3(threadsPerBlock, 1, 1);
kernelNodeParams->sharedMemBytes = 0;
kernelNodeParams->kernelParams = kernelArgs;
kernelNodeParams->extra = NULL;
}
int main(){
// Initializations
int blocksPerGrid = 5;
int threadsPerBlock = 1024;
cudaStream_t streamForGraph;
checkCudaErrors(cudaStreamCreate(&streamForGraph));
cudaGraph_t graph;
cudaGraphExec_t instance;
// Kernel node
cudaGraphNode_t kernelNode = (cudaGraphNode_t) malloc(sizeof(cudaGraphNode_t));
uint64_t* block_times_d;
uint64_t* block_times_h;
uint32_t* block_smids_d;
uint32_t* block_smids_h;
cudaKernelNodeParams kernelNodeParams;
cudaMalloc(&block_times_d, sizeof(uint64_t)*blocksPerGrid*2);
cudaMalloc(&block_smids_d, sizeof(uint32_t)*blocksPerGrid);
cudaMallocHost(&block_times_h, sizeof(uint64_t)*blocksPerGrid*2);
cudaMallocHost(&block_smids_h, sizeof(uint32_t)*blocksPerGrid);
setKernelNodeParams(&kernelNodeParams, blocksPerGrid, threadsPerBlock, &block_times_d, &block_smids_d);
kernelNodeParams.func = (void *) kernel_A;
// Create graph and add nodes
checkCudaErrors(cudaGraphCreate(&graph, 0));
checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));
// Instantiate and launch graph
checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphLaunch(instance, streamForGraph));
cudaDeviceSynchronize();
checkCudaErrors(cudaGraphExecDestroy(instance));
checkCudaErrors(cudaGraphDestroy(graph));
// Copy memory from device to host
checkCudaErrors(cudaMemcpy(block_times_h, block_times_d, sizeof(uint64_t)*blocksPerGrid*2, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(block_smids_h, block_smids_d, sizeof(uint32_t)*blocksPerGrid, cudaMemcpyDeviceToHost));
// Print block and SM info
for(int i=0; i<blocksPerGrid*2; i++){
if(i%2==0){
printf("%d, start: %lu\n", i/2, block_times_h[i]);
}
else{
printf("end: %lu\n", block_times_h[i]);
}
}
for(int i=0; i<blocksPerGrid; i++){
printf("%d, block sm id: %u\n", i, block_smids_h[i]);
}
printf("\n");
// Free allocated memory
cudaFree(block_times_d);
cudaFree(block_smids_d);
cudaFreeHost(block_times_h);
cudaFreeHost(block_smids_h);
cudaFreeHost(kernelNodeParams.kernelParams);
}
And GetSMID() and GlobalTimer64() are implemented as:
// Returns the ID of the SM this is executed on.
static __device__ __inline__ uint32_t GetSMID(void) {
uint32_t to_return;
asm volatile("mov.u32 %0, %%smid;" : "=r"(to_return));
return to_return;
}
__device__ inline uint64_t GlobalTimer64(void) {
uint32_t lo_bits;
uint64_t ret;
asm volatile("mov.u32 %0, %%clock;" : "=r"(lo_bits));
ret = 0;
ret |= lo_bits;
return ret;
}
And I’m getting the following output:
0, start: 1753298497183553536
end: 1753298497283554304
1, start: 1753298497183553536
end: 1753298497283554304
2, start: 1753298497183553536
end: 1753298497283554304
3, start: 1753298497183553536
end: 1753298497283554304
4, start: 1753298497183553536
end: 1753298497283554304
0, block sm id: 0
1, block sm id: 0
2, block sm id: 2
3, block sm id: 0
4, block sm id: 4
Is there any reason why the odd-numbered blocks would be getting the wrong SM id?