occupancy bug

I noticed a program running much more slowly than I would expect today, and I think I’ve narrowed the problem down to the occupancy being different from what is reported by the occupancy calculator and the profiler. The following was the simplest program I could write that reproduces the behavior. The observed results are on my 8800 GTX. The program would almost definitely have to be altered to reproduce the behavior on a card with a different number of MPs, but the modification needed may be as simple as changing NUM_BLOCKS_B to 2x the number of MPs you have, and possibly scaling NUM_BLOCKS_A appropriately (who knows).

[codebox]#include <stdio.h>

/* Wrap a CUDA runtime call: on failure, print file/line context and the
   runtime's error string, then abort.  The do/while(0) makes the macro
   safe to use as a single statement (e.g. in an unbraced if).
   NOTE: the forum rendering of the original inserted blank lines between
   the '\' continuations, which would terminate the macro early; the
   definition must be contiguous as below. */
#define SAFE_CALL(call) do { \
    cudaError_t err = call ; \
    if(err != cudaSuccess) { \
       fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
               __FILE__, __LINE__, cudaGetErrorString(err)); \
       exit(1); \
    } } while (0)

/* Check for a pending CUDA error (e.g. a bad kernel launch configuration):
   kernel launches do not return a cudaError_t directly, so this reads
   cudaGetLastError() right after a launch.  Prints file/line context and
   aborts on failure.  Must be a contiguous definition — the blank lines
   the forum inserted between '\' continuations break the macro. */
#define CHECK_ERROR() do { \
    cudaError_t err = cudaGetLastError(); \
    if(err != cudaSuccess) { \
       fprintf(stderr, "Cuda error detected in file '%s' in line %i : %s.\n", \
               __FILE__, __LINE__, cudaGetErrorString(err)); \
       exit(1); \
    } } while (0)

/* Launch configuration A: many trivial 1-thread blocks for the empty
   "dumbkernel1" launches that precede the timed kernel. */
#define NUM_BLOCKS_A 1700

#define NUM_THREADS_PER_BLOCK_A 1

/* Launch configuration B: per the post, NUM_BLOCKS_B is 2x the number of
   multiprocessors on the test card (8800 GTX), with one 32-thread warp per
   block, for the occupancy measurement kernel. */
#define NUM_BLOCKS_B 32

#define NUM_THREADS_PER_BLOCK_B 32

/* Empty kernel, launched (with and without dynamic shared memory) before
   the measured kernel to exercise the scheduler.  The original post lost
   the "__" around "global" to forum italics — the qualifier must be
   __global__ for this to compile. */
__global__ void dumbkernel1() {
    return;
}

/* Kernel whose runtime scales with achieved occupancy: a long serial chain
   of read-after-write dependent integer ops, so more resident warps per MP
   directly shortens wall-clock time.  Expects a 1-D grid of 1-D blocks;
   writes one result per thread into out[] (out must hold
   gridDim.x * blockDim.x elements). */
__global__ void occ_kernel(unsigned int* out) {
    /* The original showed "shared" at file scope — forum italics stripped
       the "__".  __shared__ is only legal at function scope, and this
       kernel is the array's only user, so it is declared here.  It exists
       to give the kernel a static shared-memory footprint. */
    __shared__ unsigned int s[32];

    size_t overall_thread_index = threadIdx.x + blockIdx.x*(blockDim.x);

    size_t i;
    unsigned int val = 1;

    /* Touch the shared array so the allocation is not optimized away. */
    s[0] = 1;

    /* do lots of dumb computation with lots of read after write
       dependencies so that different occupancies will have different
       runtimes */
    for(i = 0;i < 100000;++i) {
        val = val + 3;
        val = val*2;
        val = val+ 5;
        val = val*12;
    }

    out[overall_thread_index] = val;
}

/* Reproducer driver: allocate one output word per thread of configuration
   B, launch the empty kernel twice (once with 13000 bytes of dynamic
   shared memory, once with none), then — with exactly one of the
   commented-out blocks enabled — launch occ_kernel with a chosen dynamic
   shared-memory size and compare the profiler's runtime against the
   occupancy it reports. */
int main() {

    unsigned int* out;

    /* Forum rendering ate the '*' multiplication signs in the original
       size expression and split "unsigned int" across lines. */
    SAFE_CALL(cudaMalloc((void**)(&out),
                         NUM_BLOCKS_B*NUM_THREADS_PER_BLOCK_B*sizeof(unsigned int)));

    dim3 grid_dim_a,block_dim_a;

    grid_dim_a.x = NUM_BLOCKS_A;
    grid_dim_a.y = 1;
    grid_dim_a.z = 1;

    block_dim_a.x = NUM_THREADS_PER_BLOCK_A;
    block_dim_a.y = 1;
    block_dim_a.z = 1;

    dim3 grid_dim_b,block_dim_b;

    grid_dim_b.x = NUM_BLOCKS_B;
    grid_dim_b.y = 1;
    grid_dim_b.z = 1;

    block_dim_b.x = NUM_THREADS_PER_BLOCK_B;
    block_dim_b.y = 1;
    block_dim_b.z = 1;

    /* Empty kernel with 13000 bytes of dynamic shared memory requested. */
    dumbkernel1<<<grid_dim_a,block_dim_a,13000>>>();
    CHECK_ERROR();
    /* cudaThreadSynchronize is the pre-CUDA-4 name for what is now
       cudaDeviceSynchronize. */
    SAFE_CALL(cudaThreadSynchronize());

    /* Same kernel, no dynamic shared memory. */
    dumbkernel1<<<grid_dim_a,block_dim_a>>>();
    CHECK_ERROR();
    SAFE_CALL(cudaThreadSynchronize());

    //uncomment one of the following code blocks at a time and run to get the numbers I observed

    /* occupancy calculator and profiler say occupancy is 1 warp, profiler
       says runtime = 16207.456 */
    /*occ_kernel<<<grid_dim_b,block_dim_b,13000>>>(out);
    CHECK_ERROR();
    SAFE_CALL(cudaThreadSynchronize());*/

    /* occupancy calculator and profiler say occupancy is 2 warps, profiler
       says runtime = 8106.272 */
    /*occ_kernel<<<grid_dim_b,block_dim_b,5700>>>(out);
    CHECK_ERROR();
    SAFE_CALL(cudaThreadSynchronize());*/

    /* occupancy calculator and profiler say occupancy is 2 warps, profiler
       says runtime = 16207.456, so I think we're really only getting 1 warp
       of occupancy */
    /*occ_kernel<<<grid_dim_b,block_dim_b,6000>>>(out);
    CHECK_ERROR();
    SAFE_CALL(cudaThreadSynchronize());*/

    /* Release the device buffer (leaked in the original). */
    SAFE_CALL(cudaFree(out));

    /* The original used typographic quotes here, which would not compile. */
    printf("done\n");

    return 0;

}[/codebox]

Edit: simplified program slightly and fixed bad grammar