Hello
I am keen to understand how registers are assigned in a CUDA kernel, because register usage is the limiting factor in my code. My kernel is using far more registers than expected. So, I tried to analyse the register usage of a simple vector addition code.
// Number of elements in each vector; also the upper bound of the kernel's loop.
#define L 1024
// Element-wise vector sum on the device: C_dev[i] = A_dev[i] + B_dev[i]
// for every index i in [0, L).
//
// Each thread starts at its global index and then advances by the total
// number of launched threads (grid-stride loop), so any 1D launch
// configuration covers all L elements.
//
// A_dev, B_dev : input vectors, read at indices below L
// C_dev        : output vector, written at indices below L
__global__ void add(int *A_dev, int *B_dev, int *C_dev)
{
    // Total thread count of the launch; loop-invariant for every thread.
    const unsigned int stride = gridDim.x * blockDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < L; idx += stride)
    {
        C_dev[idx] = B_dev[idx] + A_dev[idx];
    }
}
// Host driver: fills two L-element vectors with ones, adds them on the GPU
// with the add kernel, prints every element of the result, and releases the
// device buffers. Returns 0 once all steps have completed.
int main (void)
{
    int A[L], B[L], C[L];
    int *A_dev, *B_dev, *C_dev;

    // Launch geometry. The block count is rounded up so that a value of L
    // smaller than (or not a multiple of) the block size still yields at
    // least one block instead of an invalid zero-block launch.
    const int threadsPerBlock = 512;
    const int numBlocks = (L + threadsPerBlock - 1) / threadsPerBlock;

    for (int i = 0; i < L; i++)
    {
        A[i] = 1;
        B[i] = 1;
    }

    // allocate the memory on the GPU
    cutilSafeCall(cudaMalloc((void**)&A_dev, L * sizeof(int)));
    cutilSafeCall(cudaMalloc((void**)&B_dev, L * sizeof(int)));
    cutilSafeCall(cudaMalloc((void**)&C_dev, L * sizeof(int)));

    // copy the inputs to the device
    cutilSafeCall(cudaMemcpy(A_dev, A, L * sizeof(int), cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(B_dev, B, L * sizeof(int), cudaMemcpyHostToDevice));

    add<<< numBlocks, threadsPerBlock >>>(A_dev, B_dev, C_dev);
    // A kernel launch returns no status itself; a bad launch configuration
    // is only visible through cudaGetLastError().
    cutilSafeCall(cudaGetLastError());
    // Errors raised while the kernel runs are reported by the next
    // synchronizing call, so its return value is checked as well.
    // NOTE(review): cudaThreadSynchronize is deprecated in newer toolkits in
    // favour of cudaDeviceSynchronize; kept here to match the toolkit this
    // file targets.
    cutilSafeCall(cudaThreadSynchronize());

    // copy the result back to the host
    cutilSafeCall(cudaMemcpy(C, C_dev, L * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < L; i++)
    {
        printf("C[%d] is %d\n", i, C[i]);
    }

    // release device memory, checked like every other runtime call
    cutilSafeCall(cudaFree(A_dev));
    cutilSafeCall(cudaFree(B_dev));
    cutilSafeCall(cudaFree(C_dev));
    return 0;
}
I compiled the code with the --ptxas-options=-v flag and got the following output:
ptxas info : Compiling entry function '_Z3addPiS_S_' for 'sm_20'
ptxas info : Function properties for _Z3addPiS_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 13 registers, 56 bytes cmem[0]
Where are the 13 registers being used? Is there any way to find out exactly how the registers are assigned?
Any help would be appreciated.
Thanks