CUDA crashing when I iterate too many times

I have a program that iterates a simple “add-one” function 100 times, using shared memory with
a total grid size of 1024*1024 variables. When I run the program, it crashes CUDA. Code is posted
in full; any ideas what’s going on?

global void FILENAME(int *d_out, int d_in, int nIterations)
{
extern shared int s_data[];
int in = blockDim.x
blockIdx.x + threadIdx.x;

for (int k=0; k<nIterations; ++k)
{
   s_data[in] = d_in[in];
   __syncthreads();
   d_out[in] = s_data[in] + 1;
   int *temp = d_out;
   d_out = d_in;
   d_in = temp;
}

}

int main(int argc, char** argv)
{
int HOST_Array;
int dimA = 1024
1024;
int nIterations = 10;
int i;

int *device_OUT, *device_IN;

int numThreadsPerBlock = 256;
int numBlocks = dimA / numThreadsPerBlock;  

int sharedMemSize = numThreadsPerBlock * sizeof(int);
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);

HOST_Array = (int *) malloc(memSize);
cudaMalloc((void **) &device_IN, memSize);
cudaMalloc((void **) &device_OUT, memSize);

for (i=0; i<dimA; ++i)
{
    HOST_Array[i] = float(i) + 1;
}
for (i=0; i<16; ++i)
{
    printf("Host Array = %d\n", HOST_Array[i]);
}

clock_t CUDA = clock();

cudaMemcpy(device_IN, HOST_Array, memSize, cudaMemcpyHostToDevice);

dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
FILENAME <<<dimGrid, dimBlock, sharedMemSize>>> (device_OUT, device_IN, nIterations);

cudaThreadSynchronize();
cudaMemcpy(HOST_Array, device_OUT, memSize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");

printf("CUDA Time = %f sec\n", ((double)clock() - CUDA)/CLOCKS_PER_SEC);
printf("\n\n");

for (i=0; i<16; ++i)
{
printf(“Host Array = %d\n”, HOST_Array[i]);
}

cudaFree(device_IN);
cudaFree(device_OUT);
free(HOST_Array);

}