CUDA BUG? atomicAdd

Hi,

First let me say that I’m a newbie at CUDA programming, so I might be missing something here. Anyway, the code follows (using CUDA 2.1, 64-bit version):

#define QUANT_NUMBERS_TO_SUM 100
#define BLOCK_SIZE 10

__global__ void sum_kernel(int * numbersToSum, int * output) {
extern __shared__ int data[];

int x = blockIdx.x * blockDim.x + threadIdx.x; // global index
    
data[threadIdx.x] = numbersToSum[x]; 
__syncthreads();

int nextInterval;
for (int interval = 1; interval < blockDim.x; interval = nextInterval) {
    nextInterval = 2 * interval;
    
    int positionSum = threadIdx.x + interval;       
    if (threadIdx.x % nextInterval == 0 && positionSum < blockDim.x) data[threadIdx.x] += data[positionSum];
    __syncthreads();
}

if (threadIdx.x == 0) atomicAdd(output, data[threadIdx.x]);    

}

__global__ void init(int * numbersToSum, int * output) {
int x = blockIdx.x * blockDim.x + threadIdx.x; // global index

if (x < QUANT_NUMBERS_TO_SUM) {
	numbersToSum[x] = x + 1;

	if (x == 0) output[0] = 0;
}

}

void Sum() {
int * dNumbers;
int * dSum;

cudaMalloc((void **) &dNumbers, QUANT_NUMBERS_TO_SUM * sizeof(int));
cudaMalloc((void **) &dSum, sizeof(int));

init<<<1, QUANT_NUMBERS_TO_SUM>>>(dNumbers, dSum);
cudaThreadSynchronize();

int nBlocks = (QUANT_NUMBERS_TO_SUM + BLOCK_SIZE - 1) / BLOCK_SIZE;
sum_kernel<<<nBlocks, BLOCK_SIZE>>>(dNumbers, dSum);
cudaThreadSynchronize();

int hSum[1];

cudaMemcpy(hSum, dSum, sizeof(int), cudaMemcpyDeviceToHost);

printf("Sum = %d", hSum[0]);

}

The sum kernel works fine for BLOCK_SIZE values up to 8. Larger values produce weird results (the correct result should be 5050).
The code looks correct to me, so can anyone point out my mistake, or is this a CUDA bug?

Thanks

I think I found the problem. When launching the kernel, I should have specified the size of the dynamic shared memory allocation.
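
Since data[] is declared extern __shared__, the block gets no dynamic shared memory unless its size is passed as the third launch configuration parameter, so the kernel ends up reading and writing past a zero-sized allocation. The launch should look something like this (one int per thread in the block):

int nBlocks = (QUANT_NUMBERS_TO_SUM + BLOCK_SIZE - 1) / BLOCK_SIZE;

// Third launch parameter = bytes of dynamic shared memory per block,
// matching the extern __shared__ int data[] declaration in the kernel.
sum_kernel<<<nBlocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(dNumbers, dSum);
cudaThreadSynchronize();

With that change the reduction has a valid shared buffer to work in, regardless of BLOCK_SIZE.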