I have a small program that tries to calculate the infinite sum of 1/(n*log(n)*log(n)). I am using CUDA 2.1. When
I compile with -arch=sm_13 and without it, I get drastically different register usage:
nvcc -O3 -use_fast_math --ptxas-options="-v -O3" slowsum.cu -o slowsum
In file included from slowsum.cu:6:
nvcchelp.h:14:47: warning: backslash and newline separated by space
ptxas info : Compiling entry function '_Z20kernel_slowsum_floatliPf'
ptxas info : Used 8 registers, 4136+40 bytes smem, 4 bytes cmem[1]
nvcc -O3 -arch=sm_13 -use_fast_math --ptxas-options="-v -O3" slowsum.cu -o slowsum
In file included from slowsum.cu:6:
nvcchelp.h:14:47: warning: backslash and newline separated by space
ptxas info : Compiling entry function '_Z20kernel_slowsum_floatliPf'
ptxas info : Used 20 registers, 4136+40 bytes smem, 4 bytes cmem[1]
This change in register usage makes a significant difference in performance because of occupancy. My question
is: why does this happen?
Thanks,
Craig
[codebox]
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<string.h>
#include “nvcchelp.h”
#define ANSWER 2.109742801237
#define REALTYPE float
__shared__ REALTYPE sdata[1024];
__global__ void kernel_slowsum_float(long N, int width, REALTYPE *total) {
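// Each thread accumulates `width' consecutive terms of 1/(n*log(n)^2), starting at n = (global thread index)*width + 2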
REALTYPE n=(REALTYPE)((blockIdx.x*blockDim.x+threadIdx.x)*width)+2.0;
REALTYPE sum=0.0;
REALTYPE v;
for (int i=0;i<width;i++) {
v=log(n);
sum+=1.0/(n*v*v);
n+=1.0;
}
// Now do the reduction
unsigned int tid = threadIdx.x;
sdata[tid] = sum;
__syncthreads();
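// Tree reduction in shared memory: the stride s doubles each pass until the block's sum is left in sdata[0]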
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
int index = 2 * s * tid;
if (index + s < blockDim.x) // blockDim.x is not a power of two here, so make sure sdata[index + s] holds a valid partial sum
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) {
// printf("Working on %d %d %f\n", blockIdx.x,threadIdx.x,sdata[0]);
total[blockIdx.x] = sdata[0];
}
}
int main (int argc, char** argv) {
int deviceCount;
int dev=0;
cudaDeviceProp deviceProp;
/* Initialize */
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "Error: No devices supporting CUDA.\n");
exit(1);
}
cudaGetDeviceProperties(&deviceProp, dev);
if (deviceProp.major < 1) {
fprintf(stderr,"Error: Device does not support CUDA\n");
exit(1);
}
printf("Using device: %d\n", dev, deviceProp.name);
cudaSetDevice(dev);
/* Setup the threads and grids */
long N=10;
int p=8;
for(int i=1;i<p;i++) N=N*10;
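/* Total number of terms: N = 10^p */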
int width=1000;
int block_size=500;
printf("Element size: %ld\n", N);
printf("Block size %ld\n", block_size);
printf("Grid size %ld\n", N/width/block_size);
dim3 dimBlock(block_size,1);
int grid_size=N/width/block_size;
dim3 dimGrid(grid_size,1);
printf("%d %ld %ld\n", block_size, N, N/width/block_size);
/* Kernel */
double t1;
t1=gettime();
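/* Host and device buffers for the per-block partial sums (one entry per block) */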
REALTYPE *total;
total=(REALTYPE *) malloc(sizeof(REALTYPE)*(grid_size));
for (int i=0;i<grid_size;i++) {
total[i]=0.0;
}
REALTYPE *d_total;
cudaMalloc((void **) &d_total, sizeof(REALTYPE)*(grid_size));
cudaMemcpy(d_total, total, sizeof(REALTYPE)*(grid_size), cudaMemcpyHostToDevice);
kernel_slowsum_float<<<dimGrid,dimBlock>>> (N,width,d_total);
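/* Copy the per-block partial sums back and finish the sum on the host */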
cudaMemcpy(total, d_total, sizeof(REALTYPE)*(grid_size), cudaMemcpyDeviceToHost);
REALTYPE sum=0.0;
for (int i=0;i<grid_size;i++) {
printf("Got %d %lf\n", i, total[i]);
sum+=total[i];
}
printf("Time to compute first kernel: %lf\n", gettime()-t1);
printf("What did I get: %lf\n", sum);
CUT_EXIT(argc, argv);
}
[/codebox]
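P.S. In case it is relevant: I could also try a single-precision-only version of the inner loop (logf() and f-suffixed literals instead of log() and double literals). This is just a sketch I have not benchmarked, so I don't know yet whether it changes the register count:
[codebox]
// Single-precision-only variant of the per-thread loop (untested sketch)
for (int i=0;i<width;i++) {
v=logf(n); // logf() keeps the logarithm in float
sum+=1.0f/(n*v*v); // f-suffixed literals avoid promotion to double
n+=1.0f;
}
[/codebox]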