Riedijk’s code works flawlessly on my Linux boxes, with:
8600GT and CUDA 3.2
GTX460 and CUDA 3.2
GTX480 and CUDA 3.2
and also with an older CUDA release:
GTX280 and CUDA 2.3
Try the code below, compiling it with nvcc code.cu and executing it with ./a.out 100.
If it fails, describe your hardware, OS and CUDA stack.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
/* E.D. Riedijk */
/*
 * Return the value of the PTX special register %smid as read by the
 * calling thread, i.e. the identifier of the streaming multiprocessor the
 * thread is executing on at the moment of the read.
 *
 * The return type is spelled `unsigned int` rather than `uint`: `uint` is
 * not a standard C/C++ type and is only available where a system header
 * happens to typedef it.
 */
__device__ unsigned int get_smid(void) {
    unsigned int smid;
    /*
     * "%%" produces a literal '%' in the emitted PTX; a single '%' is
     * reserved for operand references such as %0.
     * `volatile` keeps the compiler from merging or hoisting the read.
     */
    asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
    return smid;
}
/*
 * For every block in the grid, store the value returned by get_smid() into
 * sm[blockIdx.x]. Only the thread with threadIdx.x == 0 performs the store;
 * every other thread in the block exits immediately.
 *
 * sm must point to device memory with at least gridDim.x elements.
 */
__global__ void kern(int *sm){
    /* One writer per block is enough: skip all but the first thread. */
    if (threadIdx.x != 0) {
        return;
    }
    const unsigned int block = blockIdx.x;
    sm[block] = get_smid();
}
/* Print a readable message and exit when a CUDA runtime call has failed. */
static void check_cuda(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Usage: ./a.out N
 *
 * Launches kern with N blocks of N threads, copies the N per-block results
 * back to the host and prints one "<block index> <recorded value>" line per
 * block.
 *
 * Returns 0 on success. Exits with a non-zero status and a message on
 * stderr when the argument is missing or not a positive integer, when host
 * memory cannot be allocated, or when any CUDA call fails (for example
 * because N exceeds a launch limit of the device).
 */
int main(int argc, char *argv[]){
    if (argc < 2) {
        fprintf(stderr, "usage: %s N\n", argc > 0 ? argv[0] : "a.out");
        return EXIT_FAILURE;
    }

    int N = atoi(argv[1]);
    if (N < 1) {
        /* atoi yields 0 for non-numeric input, so this also rejects garbage. */
        fprintf(stderr, "N must be a positive integer, got \"%s\"\n", argv[1]);
        return EXIT_FAILURE;
    }

    const size_t bytes = (size_t)N * sizeof(int);
    int *sm, *sm_d;

    sm = (int *) malloc(bytes);
    if (sm == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }

    check_cuda(cudaMalloc((void**)&sm_d, bytes), "cudaMalloc");

    kern<<<N,N>>>( sm_d);
    /* A launch does not return a status; an invalid configuration is only
       visible through cudaGetLastError(). */
    check_cuda(cudaGetLastError(), "kernel launch");

    /* The blocking copy waits for the kernel, so errors raised while the
       kernel was executing are reported here as well. */
    check_cuda(cudaMemcpy(sm, sm_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

    for (int i=0;i<N;i++)
        printf("%d %d\n",i,sm[i]);

    check_cuda(cudaFree(sm_d), "cudaFree");
    free(sm);
    return 0;
}