why does this code fail at assertion?

this code fails when N >= 4096, it works fine at 2048 and lower multiples of 256, i just dont understand why this is happening.
can somebody shed some light on the matter?
thanks in advance

#include <stdio.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>

global void addOne(int* A, int N)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < N) A[i] += 1;
}

int main()
{
int N = 4096;
int i = 0;
int* h_a; //host array
int* d_a; //device array

h_a = (int*)malloc(N*sizeof(int));
cudaMalloc( (void**) &d_a, N);

for (i = 0; i < N; i++) h_a[i] = 0;

cudaMemcpy(d_a, h_a, sizeof(int)*N, cudaMemcpyHostToDevice);

double numThreadsPerBlock = 256;
double numBlocks = N / numThreadsPerBlock;

addOne<<<numThreadsPerBlock, numBlocks>>> (d_a, N);

cudaMemcpy(h_a, d_a, sizeof(int)*N, cudaMemcpyDeviceToHost);

for (i = 0; i < N; i++) assert(h_a[i] == 1);

system("pause");

return 0;

}

cudaMalloc( (void**) &d_a, N);
should be
cudaMalloc( (void**) &d_a, N*sizeof(int));
:ph34r: NEXT

omg cant believe i overlooked that … thanks