%smid register returning 0

tdeme · July 23, 2025, 7:42pm

I have some CUDA code that launches a simple graph with only a single kernelNode and logs the block start and end times, as well as the SM IDs. Strangely, the output suggests that every other block is being assigned to SM 0 (all at the same time). This should not be possible, however, since the blocks have 1024 threads each and my RTX 4050 has a 1536 thread per SM limit (right?).

#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>
#include "helper_cuda.h"
#include "benchmark_gpu_utilities.h"


__global__ void kernel_A(uint64_t *block_times, uint64_t *block_smids){
    uint64_t start_time = GlobalTimer64();

    // First, record the kernel and block start times
    // and block SM iDs
    if (threadIdx.x == 0) {
        block_times[blockIdx.x * 2] = start_time;
        block_smids[blockIdx.x] = GetSMID();
    }

    // Spin for 1 second
    while ((GlobalTimer64()-start_time)<(100*1000*1000)){
        continue;
    }

    // Record the kernel and block end times.
    if (threadIdx.x == 0) {
        block_times[blockIdx.x * 2 + 1] = GlobalTimer64();
    }

}


void setKernelNodeParams(cudaKernelNodeParams *kernelNodeParams, int blocksPerGrid, int threadsPerBlock, uint64_t **block_times_d, uint32_t **block_smids_d){

    void **kernelArgs = (void **) malloc(sizeof(void*)*2);
    kernelArgs[0] = (void *) block_times_d;
    kernelArgs[1] = block_smids_d;

    kernelNodeParams->gridDim = dim3(blocksPerGrid, 1, 1);
    kernelNodeParams->blockDim = dim3(threadsPerBlock, 1, 1);
    kernelNodeParams->sharedMemBytes = 0;
    kernelNodeParams->kernelParams = kernelArgs;
    kernelNodeParams->extra = NULL;

}


int main(){

    // Initializations

    int blocksPerGrid = 5;
    int threadsPerBlock = 1024;

    cudaStream_t streamForGraph;
    checkCudaErrors(cudaStreamCreate(&streamForGraph));

    cudaGraph_t graph;
    cudaGraphExec_t instance;


    // Kernel node
    cudaGraphNode_t kernelNode = (cudaGraphNode_t) malloc(sizeof(cudaGraphNode_t));

    uint64_t* block_times_d;
    uint64_t* block_times_h;
    uint32_t* block_smids_d;
    uint32_t* block_smids_h;

    cudaKernelNodeParams kernelNodeParams;

    cudaMalloc(&block_times_d, sizeof(uint64_t)*blocksPerGrid*2);
    cudaMalloc(&block_smids_d, sizeof(uint32_t)*blocksPerGrid);
    cudaMallocHost(&block_times_h, sizeof(uint64_t)*blocksPerGrid*2);
    cudaMallocHost(&block_smids_h, sizeof(uint32_t)*blocksPerGrid);

    setKernelNodeParams(&kernelNodeParams, blocksPerGrid, threadsPerBlock, &block_times_d, &block_smids_d);

    kernelNodeParams.func = (void *) kernel_A;

    // Create graph and add nodes

    checkCudaErrors(cudaGraphCreate(&graph, 0));

    checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));

    // Instantiate and launch graph

    checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));

    checkCudaErrors(cudaGraphLaunch(instance, streamForGraph));

    cudaDeviceSynchronize();

    checkCudaErrors(cudaGraphExecDestroy(instance));
    checkCudaErrors(cudaGraphDestroy(graph));

    // Copy memory from device to host

    checkCudaErrors(cudaMemcpy(block_times_h, block_times_d, sizeof(uint64_t)*blocksPerGrid*2, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(block_smids_h, block_smids_d, sizeof(uint32_t)*blocksPerGrid, cudaMemcpyDeviceToHost));

    // Print block and SM info
    for(int i=0; i<blocksPerGrid*2; i++){
        if(i%2==0){
            printf("%d, start: %lu\n", i/2, block_times_h[i]);
        }

        else{
            printf("end: %lu\n", block_times_h[i]);
        }
    
    }

    for(int i=0; i<blocksPerGrid; i++){
        printf("%d, block sm id: %u\n", i, block_smids_h[i]);
    }

    printf("\n");



    // Free allocated memory

    cudaFree(block_times_d);
    cudaFree(block_smids_d);
    cudaFreeHost(block_times_h);
    cudaFreeHost(block_smids_h);
    cudaFreeHost(kernelNodeParams.kernelParams);
}

And GetSMID() and GlobalTimer64() are implemented as:

// Returns the ID of the SM this is executed on.
static __device__ __inline__ uint32_t GetSMID(void) {
  uint32_t to_return;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(to_return));
  return to_return;
}

__device__ inline uint64_t GlobalTimer64(void) {
  uint32_t lo_bits;
  uint64_t ret;
  asm volatile("mov.u32 %0, %%clock;" : "=r"(lo_bits));
  ret = 0;
  ret |= lo_bits;
  return ret;
}

And I’m getting the following output:

0, start: 1753298497183553536
end: 1753298497283554304
1, start: 1753298497183553536
end: 1753298497283554304
2, start: 1753298497183553536
end: 1753298497283554304
3, start: 1753298497183553536
end: 1753298497283554304
4, start: 1753298497183553536
end: 1753298497283554304
0, block sm id: 0
1, block sm id: 0
2, block sm id: 2
3, block sm id: 0
4, block sm id: 4

Is there any reason why the odd-numbered blocks would be getting the wrong SM id?

Robert_Crovella · July 23, 2025, 9:06pm

Your kernel is performing illegal activity. The results should not be trusted, relied upon, or interpreted.

Your kernel prototype for block_smids does not match your host definition, nor your allocation:

__global__ void kernel_A(uint64_t *block_times, uint64_t *block_smids){
                                                ^^^^^^^^
...
uint32_t* block_smids_d;
^^^^^^^^
...
cudaMalloc(&block_smids_d, sizeof(uint32_t)*blocksPerGrid);
                                  ^^^^^^^^

When I change the kernel prototype accordingly (to uint32_t *block_smids) then the errors mostly go away and the output looks reasonable to me.

There is a remaining error in your usage of cudaFreeHost (invalid argument). I’ll let you sort that out.

compute-sanitizer is a useful tool and is how I spotted these things. You might want to learn about it if you’re not familiar. I usually recommend that anytime someone is having trouble with a CUDA code, to use it. I also usually recommend rigorous proper CUDA error checking. I define “rigorous” as “at every sensible opportunity”. You are not doing that either.

Doing these things before asking others for help might save you some time. If you’re unable to interpret the results, the output from compute-sanitizer will be useful for those trying to help you.

tdeme · July 24, 2025, 2:36pm

Thank you! These tips worked! And thanks for suggesting compute-sanitizer, I’m new to CUDA programming, so this was valuable advice.

system · August 7, 2025, 2:36pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
any way to know on which SM a thread is running? CUDA Programming and Performance	22	11953	July 17, 2017
Global Timing and Kernels CUDA Programming and Performance	9	1452	July 17, 2013
Identifying SM number of a block CUDA Programming and Performance	1	4190	May 3, 2010
Which SM is each block using? CUDA Programming and Performance	1	825	July 15, 2013
What is the relationship between block_idx and sm_idx CUDA Programming and Performance	5	643	September 30, 2023
Collecting busy SM IDs CUDA Programming and Performance	5	434	March 12, 2024
Thread Block Scheduling on SM in Dynamic Parallelism CUDA Programming and Performance	6	3376	January 18, 2015
Block ids and SM ids CUDA Programming and Performance	4	2109	April 15, 2011
On smid CUDA Programming and Performance	1	1323	February 17, 2012
visibility what thread contains to what SM CUDA Programming and Performance	5	1802	August 5, 2013

%smid register returning 0

Related topics