I have a similar problem with a Tesla C1060 card that has 4 GB of memory.
cudaMalloc does not fail, but when the kernel is invoked and I try to access the memory, the kernel crashes.
The code is as follows:
unsigned short* d_ImagePointer;
cutilSafeCall(cudaMalloc((void**) &d_ImagePointer, sizeof(unsigned short)*1024*1024*8)); //16 MB
cudaMemcpy(d_lpwSaveNpoint,h_ImagePointer,sizeof(unsigned short)*1024*1024*8, cudaMemcpyHostToDevice); //h_ImagePointer above holds a valid pointer to 16 MB of host-side memory
__global__ kernel(unsigned short* d_ImagePointer)
int bufferlen = 1024*1024*8; //2 bytes per index
if (*(d_ImagePointer+i) == 0)
//do something else
With this, the kernel crashes.
Does cudaMalloc have any limitation in terms of memory size?