Collecting busy SM IDs

Hi
In the following code, I would like to collect the IDs of the SMs that executed the kernel. The kernel is a simple element-wise addition, and I have written this code:

// Returns the ID of the SM the calling thread is running on at the moment of
// the call, read from the PTX special register %smid.
__device__ uint get_smid(void) {
     uint ret;
     // volatile: %smid is a volatile special register (its value can change if
     // the thread is rescheduled), so the compiler must not merge, hoist or
     // drop this read.
     asm volatile("mov.u32 %0, %smid;" : "=r"(ret) );
     return ret;
}
// Kernel: every thread whose index i is below n reads the ID of the SM it is
// running on, appends it to smVector, and adds 1 to v[i].
// NOTE(review): if `vector` is std::vector, push_back is a host-only member
// function, which would explain the "calling a host function" compile error
// reported for this code -- confirm which vector type is meant.
__global__ void simpleAdd(float *v, int n, vector<int> &smVector)
{
  // Flat thread index along x across all blocks.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads with i >= n do nothing.
  if (i < n) {
    int sm = get_smid();
    smVector.push_back(sm);
    v[i] = v[i] + 1;
  }
}
...
int main()
{
  ...
  // Launch simpleAdd with numBlocks blocks of blockSize threads each; smVector
  // is passed straight through as a kernel argument.
  simpleAdd<<<numBlocks, blockSize>>>(deviceVector, n, smVector);
  ...
}

However, compilation fails with an error saying that calling a host function from a device kernel is not allowed. I also tried the following code, which accesses the vector elements with the [] operator instead of push_back, but I get the same error.

// Reads the PTX special register %smid and returns it: the ID of the SM on
// which the calling thread is executing when the register is read.
__device__ uint get_smid(void) {
     uint ret;
     // Marked volatile because %smid itself is volatile (a thread may be
     // rescheduled onto another SM); without it the compiler is free to
     // reuse, move or eliminate the read.
     asm volatile("mov.u32 %0, %smid;" : "=r"(ret) );
     return ret;
}
// Kernel: every thread whose index i is below n reads the ID of the SM it is
// running on, increments the smVector entry at that ID, and adds 1 to v[i].
// NOTE(review): if `vector` is std::vector, operator[] is a host-only member
// function, which would explain why this variant hits the same compile error
// as the push_back one -- confirm which vector type is meant.
// NOTE(review): the increment is a plain ++ with no atomic, and many threads
// can share one SM ID -- looks like a data race even with a device-side
// container; confirm.
__global__ void simpleAdd(float *v, int n, vector<int> &smVector)
{
  // Flat thread index along x across all blocks.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads with i >= n do nothing.
  if (i < n) {
    int sm = get_smid();
    smVector[sm]++;
    v[i] = v[i] + 1;
  }
}
...
int main()
{
  ...
  // Give the vector 68 entries before the launch.
  // NOTE(review): presumably one counter per SM of the author's GPU; the
  // hard-coded 68 is device-specific -- confirm.
  smVector.resize(68);
  // Launch simpleAdd with numBlocks blocks of blockSize threads each; smVector
  // is passed straight through as a kernel argument.
  simpleAdd<<<numBlocks, blockSize>>>(deviceVector, n, smVector);
  ...
}

Any idea on how to achieve that?

If this is std::vector, it cannot work: its member functions are host-only and cannot be called from device code. If it is your own vector class, declare its member functions as __host__ __device__.

Also note that %smid is bounded by %nsmid, and

The SM identifier numbering is not guaranteed to be contiguous, so %nsmid may be larger than the physical number of SMs in the device.

So you would need to use %nsmid as the buffer size.

Apart from the original question, which is still open for me, using %nsmid gives a different result than %smid. For example, for a short array on the device, %nsmid shows SM_68 while %smid shows SM_0.

That is no surprise, is it? %nsmid is simply an upper bound for %smid; they are not equivalent.

This code will print a histogram of the SM IDs that were used.

#include <iostream>
#include <map>

// Returns the ID of the SM on which the calling thread is executing at the
// time of the call, as read from the PTX special register %smid.
__device__ 
int get_smid(void) {
    int ret;
    // volatile: %smid is a volatile special register whose value may change
    // during execution (e.g. after rescheduling), so this read must not be
    // merged with another one, hoisted, or optimized away.
    asm volatile("mov.u32 %0, %smid;" : "=r"(ret) );
    return ret;
}

// Records, for each block, the SM ID observed by that block's first thread.
// smidPerBlock is indexed by blockIdx.x, so it must hold one int per block
// launched along x; the kernel itself performs no bounds check.
__global__ 
void kernel(int* smidPerBlock){
    // Only thread 0 of each block writes; every other thread exits at once.
    if(threadIdx.x != 0){
        return;
    }
    const int blockSlot = blockIdx.x;
    smidPerBlock[blockSlot] = get_smid();
}

// Launches `kernel` with one output slot per block, then prints a histogram
// "<sm id> <number of blocks that reported it>" on stdout, one line per SM ID.
// Returns 0 on success and 1 if any CUDA call fails (the error is written to
// stderr).
int main(){
    const int numBlocks = 4096;
    const int threadsPerBlock = 128;

    // Reports a failed CUDA call on stderr; yields true when the call succeeded.
    auto succeeded = [](cudaError_t status, const char* what){
        if(status != cudaSuccess){
            std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
            return false;
        }
        return true;
    };

    // Managed memory so the host can read the results directly after the sync.
    int* smidPerBlock = nullptr;
    if(!succeeded(cudaMallocManaged(&smidPerBlock, sizeof(int) * numBlocks), "cudaMallocManaged")){
        return 1;
    }

    kernel<<<numBlocks, threadsPerBlock>>>(smidPerBlock);
    // cudaGetLastError catches launch-configuration errors; the synchronize
    // both waits for the kernel and surfaces errors raised while it ran.
    const bool kernelOk = succeeded(cudaGetLastError(), "kernel launch")
                       && succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    if(!kernelOk){
        cudaFree(smidPerBlock);
        return 1;
    }

    // Count how many blocks reported each SM ID; std::map keeps the IDs sorted.
    std::map<int,int> histogram;
    for(int i = 0; i < numBlocks; i++){
        histogram[smidPerBlock[i]]++;
    }
    for(const auto& pair : histogram){
        std::cout << pair.first << " " << pair.second << "\n";
    }

    // Release the managed buffer (the original never freed it).
    if(!succeeded(cudaFree(smidPerBlock), "cudaFree")){
        return 1;
    }
    return 0;
}
1 Like

OK Thank you very much.
I also found an answer to the first question, with an example, on this page.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.