%smid register returning 0

I have some CUDA code that launches a simple graph with only a single kernelNode and logs the block start and end times, as well as the SM IDs. Strangely, the output suggests that every other block is being assigned to SM 0 (all at the same time). This should not be possible, however, since the blocks have 1024 threads each and my RTX 4050 has a 1536 thread per SM limit (right?).

#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>
#include "helper_cuda.h"
#include "benchmark_gpu_utilities.h"


__global__ void kernel_A(uint64_t *block_times, uint64_t *block_smids){
    uint64_t start_time = GlobalTimer64();

    // First, record the kernel and block start times
    // and block SM iDs
    if (threadIdx.x == 0) {
        block_times[blockIdx.x * 2] = start_time;
        block_smids[blockIdx.x] = GetSMID();
    }

    // Spin for 1 second
    while ((GlobalTimer64()-start_time)<(100*1000*1000)){
        continue;
    }

    // Record the kernel and block end times.
    if (threadIdx.x == 0) {
        block_times[blockIdx.x * 2 + 1] = GlobalTimer64();
    }

}


void setKernelNodeParams(cudaKernelNodeParams *kernelNodeParams, int blocksPerGrid, int threadsPerBlock, uint64_t **block_times_d, uint32_t **block_smids_d){

    void **kernelArgs = (void **) malloc(sizeof(void*)*2);
    kernelArgs[0] = (void *) block_times_d;
    kernelArgs[1] = block_smids_d;

    kernelNodeParams->gridDim = dim3(blocksPerGrid, 1, 1);
    kernelNodeParams->blockDim = dim3(threadsPerBlock, 1, 1);
    kernelNodeParams->sharedMemBytes = 0;
    kernelNodeParams->kernelParams = kernelArgs;
    kernelNodeParams->extra = NULL;

}


int main(){

    // Initializations

    int blocksPerGrid = 5;
    int threadsPerBlock = 1024;

    cudaStream_t streamForGraph;
    checkCudaErrors(cudaStreamCreate(&streamForGraph));

    cudaGraph_t graph;
    cudaGraphExec_t instance;


    // Kernel node
    cudaGraphNode_t kernelNode = (cudaGraphNode_t) malloc(sizeof(cudaGraphNode_t));

    uint64_t* block_times_d;
    uint64_t* block_times_h;
    uint32_t* block_smids_d;
    uint32_t* block_smids_h;

    cudaKernelNodeParams kernelNodeParams;

    cudaMalloc(&block_times_d, sizeof(uint64_t)*blocksPerGrid*2);
    cudaMalloc(&block_smids_d, sizeof(uint32_t)*blocksPerGrid);
    cudaMallocHost(&block_times_h, sizeof(uint64_t)*blocksPerGrid*2);
    cudaMallocHost(&block_smids_h, sizeof(uint32_t)*blocksPerGrid);

    setKernelNodeParams(&kernelNodeParams, blocksPerGrid, threadsPerBlock, &block_times_d, &block_smids_d);

    kernelNodeParams.func = (void *) kernel_A;

    // Create graph and add nodes

    checkCudaErrors(cudaGraphCreate(&graph, 0));

    checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &kernelNodeParams));

    // Instantiate and launch graph

    checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));

    checkCudaErrors(cudaGraphLaunch(instance, streamForGraph));

    cudaDeviceSynchronize();

    checkCudaErrors(cudaGraphExecDestroy(instance));
    checkCudaErrors(cudaGraphDestroy(graph));

    // Copy memory from device to host

    checkCudaErrors(cudaMemcpy(block_times_h, block_times_d, sizeof(uint64_t)*blocksPerGrid*2, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(block_smids_h, block_smids_d, sizeof(uint32_t)*blocksPerGrid, cudaMemcpyDeviceToHost));

    // Print block and SM info
    for(int i=0; i<blocksPerGrid*2; i++){
        if(i%2==0){
            printf("%d, start: %lu\n", i/2, block_times_h[i]);
        }

        else{
            printf("end: %lu\n", block_times_h[i]);
        }
    
    }

    for(int i=0; i<blocksPerGrid; i++){
        printf("%d, block sm id: %u\n", i, block_smids_h[i]);
    }

    printf("\n");



    // Free allocated memory

    cudaFree(block_times_d);
    cudaFree(block_smids_d);
    cudaFreeHost(block_times_h);
    cudaFreeHost(block_smids_h);
    cudaFreeHost(kernelNodeParams.kernelParams);
}

And GetSMID() and GlobalTimer64() are implemented as:

// Returns the ID of the SM this is executed on.
static __device__ __inline__ uint32_t GetSMID(void) {
  uint32_t to_return;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(to_return));
  return to_return;
}

__device__ inline uint64_t GlobalTimer64(void) {
  uint32_t lo_bits;
  uint64_t ret;
  asm volatile("mov.u32 %0, %%clock;" : "=r"(lo_bits));
  ret = 0;
  ret |= lo_bits;
  return ret;
}

And I’m getting the following output:

0, start: 1753298497183553536
end: 1753298497283554304
1, start: 1753298497183553536
end: 1753298497283554304
2, start: 1753298497183553536
end: 1753298497283554304
3, start: 1753298497183553536
end: 1753298497283554304
4, start: 1753298497183553536
end: 1753298497283554304
0, block sm id: 0
1, block sm id: 0
2, block sm id: 2
3, block sm id: 0
4, block sm id: 4

Is there any reason why the odd-numbered blocks would be getting the wrong SM id?

Your kernel is performing illegal activity. The results should not be trusted, relied upon, or interpreted.

Your kernel prototype for block_smids does not match your host definition, nor your allocation:

__global__ void kernel_A(uint64_t *block_times, uint64_t *block_smids){
                                                ^^^^^^^^
...
uint32_t* block_smids_d;
^^^^^^^^
...
cudaMalloc(&block_smids_d, sizeof(uint32_t)*blocksPerGrid);
                                  ^^^^^^^^

When I change the kernel prototype accordingly (to uint32_t *block_smids) then the errors mostly go away and the output looks reasonable to me.

There is a remaining error in your usage of cudaFreeHost (invalid argument). I’ll let you sort that out.

compute-sanitizer is a useful tool and is how I spotted these things. You might want to learn about it if you’re not familiar. I usually recommend that anytime someone is having trouble with a CUDA code, to use it. I also usually recommend rigorous proper CUDA error checking. I define “rigorous” as “at every sensible opportunity”. You are not doing that either.

Doing these things before asking others for help might save you some time. If you’re unable to interpret the results, the output from compute-sanitizer will be useful for those trying to help you.

2 Likes

Thank you! These tips worked! And thanks for suggesting compute-sanitizer, I’m new to CUDA programming, so this was valuable advice.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.