When I use nvcc --ptxas-options=-v -arch=sm_86, I see register per thread: 10.
But when I use nsight compute to profile the kernel, I see register per thread: 16.
And here is the code
#include <cuda_runtime.h>
#define thread_per_block 512
__global__ void device_copy_scalar_kernel(int* d_in, int* d_out, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = idx; i < N; i += blockDim.x * gridDim.x) {
d_out[i] = d_in[i];
}
}
void device_copy_scalar(int* d_in, int* d_out, int N)
{
int blocks = (N + thread_per_block-1) / thread_per_block;
device_copy_scalar_kernel<<<blocks, thread_per_block>>>(d_in, d_out, N);
}
int main(){
int num = 1024000;
int *d_in, *d_out;
cudaMalloc(&d_in, num * sizeof(int));
cudaMalloc(&d_out, num * sizeof(int));
int *host_in = (int*)malloc(num * sizeof(int));
int *host_out = (int*)malloc(num * sizeof(int));
// initialize host_in, host_out
for (int i = 0; i < num; i++) {
host_in[i] = i;
host_out[i] = 0;
}
cudaMemcpy(d_in, host_in, num * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_out, host_out, num * sizeof(int), cudaMemcpyHostToDevice);
device_copy_scalar(d_in, d_out, num);
}
I also want to know what is " Driver Shared Memory Per Block" in launch statistics?I know static/dynamic shared memory, any documents about Driver Shared Memory?Thanks.