Cuda malloc error

Hi everyone,

I try to malloc memory by using malloc in cuda:
int nx = 512, ny = 512, nz = 100;
size_t size = nxnynz*sizeof(unsigned char);
cudaMalloc((void **) &cuda_a, size);

however, in the result I get error: cuda error: memcpy: invalid configuration argument.
my videocard is nvidia GF 540M

Is it very big size for cuda???
or I’m doing something wrong:
Full code:

void ApplyProjectionMatrix3DCuda(unsigned char *a, float b[16], unsigned char *result, int nx, int ny, int nz)
    {
        dim3 block_size(CUSTOM_BLOCK_SIZE_3D, CUSTOM_BLOCK_SIZE_3D, CUSTOM_BLOCK_SIZE_3D);
        dim3 n_blocks( (nx+ block_size.x - 1)/block_size.x, (ny + block_size.y - 1)/block_size.y, (nz + block_size.z - 1)/block_size.z   );

        unsigned char *cuda_a;
        float *cuda_b;
        unsigned char *cuda_result;
        size_t size = nx*ny*nz*sizeof(unsigned char);
        size_t sizeB = 16 * sizeof(float);
        cudaMalloc((void **) &cuda_a, size);
        cudaMalloc((void **) &cuda_b, sizeB);
        cudaMalloc((void **) &cuda_result, size);
        cudaMemset(cuda_a ,0,size);
        cudaMemset(cuda_b ,0,sizeB);
        cudaMemset(cuda_result ,0,size);
        cudaMemcpy(cuda_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(cuda_b, b, sizeB, cudaMemcpyHostToDevice);
        cudaMemcpy(cuda_result, result, size, cudaMemcpyHostToDevice);
        checkCUDAError3D("memcpy");
        ApplyProjectionMatrix3D   <<< block_size ,  n_blocks >>>  (cuda_a, cuda_b, cuda_result, nx, ny, nz);
        cudaThreadSynchronize();
        cudaMemcpy(a, cuda_a, size, cudaMemcpyDeviceToHost);
        cudaMemcpy(b, cuda_b, sizeB, cudaMemcpyDeviceToHost);
        cudaMemcpy(result, cuda_result, size, cudaMemcpyDeviceToHost);
        cudaFree(cuda_a);
        cudaFree(cuda_b);
        cudaFree(cuda_result);
    }

The invalid configuration argument error is probably left over from a previous kernel launch. Your error checking is rather sparse.

Your kernel launch appears to be configured incorrectly:

ApplyProjectionMatrix3D   <<< block_size ,  n_blocks >>>

You appear to have your two configuration arguments reversed. I believe it should be:

ApplyProjectionMatrix3D   <<< n_blocks ,  block_size >>>

Whenever you are having trouble with a cuda code, it’s always a good idea to do proper CUDA error checking on every API call and kernel call. Many of the CUDA sample codes demonstrate how.