this code fails when N >= 4096, it works fine at 2048 and lower multiples of 256, i just dont understand why this is happening.
can somebody shed some light on the matter?
thanks in advance
#include <stdio.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
global void addOne(int* A, int N)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < N) A[i] += 1;
}
int main()
{
int N = 4096;
int i = 0;
int* h_a; //host array
int* d_a; //device array
h_a = (int*)malloc(N*sizeof(int));
cudaMalloc( (void**) &d_a, N);
for (i = 0; i < N; i++) h_a[i] = 0;
cudaMemcpy(d_a, h_a, sizeof(int)*N, cudaMemcpyHostToDevice);
double numThreadsPerBlock = 256;
double numBlocks = N / numThreadsPerBlock;
addOne<<<numThreadsPerBlock, numBlocks>>> (d_a, N);
cudaMemcpy(h_a, d_a, sizeof(int)*N, cudaMemcpyDeviceToHost);
for (i = 0; i < N; i++) assert(h_a[i] == 1);
system("pause");
return 0;
}