cudaMemcpy: invalid argument issue

Hello,

I am trying to optimize a serial application using cuda.The code I’ve written seems to compile just fine.However it fails at some point during execution.

I’ve narrowed it down to the troublesome line and cudaMemcpy gives an invalid argument error.I’m working on v3.2 SDK on a windows platform which does not allow me to build EmuRelease or EmuDebug projects.

Here is the code:

int start_kernel_search(int dimGridx, int dimGridy, int dimBlockx, int dimBlocky, int dimBlockz, int sharedMem, uint *cpu_memory_block,uint cpu_memory_block_size, long *cpu_found_index)

{

  if(cpu_memory_block != NULL && cpu_found_index != NULL)

  {

    dim3 grid, block;

    uint4 *gpu_memory_block;

    long *gpu_found_index;

    size_t memSize, foundSize;

    cudaError_t err;

grid.x = dimGridx;  // 1 when called

    grid.y = dimGridy;  // 1 when called

    grid.z = 1; 

block.x = dimBlockx; // 512 when called

    block.y = dimBlocky; // 1 when called

    block.z = dimBlockz; // 1 when called 

memSize = cpu_memory_block_size * sizeof(uint4);

    foundSize = 1 * sizeof(long);

err = cudaMalloc( (void**)&gpu_memory_block, memSize); // memSize is 96 in my test example 

    if(err != cudaSuccess)

    {

	    printf("CUDA error(1): %s \n", cudaGetErrorString(err)); 

    }

err = cudaMalloc( (void**)&gpu_found_index, foundSize);

    if(err != cudaSuccess)

{

	    printf("CUDA error(2): %s \n", cudaGetErrorString(err));

    }

err = cudaMemcpy(gpu_memory_block, (uint4*)cpu_memory_block, memSize, cudaMemcpyHostToDevice); // is this cast meaningful or not?

if(err != cudaSuccess) 

    {

	    printf("CUDA error(3): %s \n", cudaGetErrorString(err));

    }

    err = cudaMemcpy(gpu_found_index, cpu_found_index, foundSize, cudaMemcpyHostToDevice); 

if(err != cudaSuccess) 

    {

	    printf("CUDA error(4): %s \n", cudaGetErrorString(err));

    }

compute_hashes_on_memory_block_items<<<grid, block, sharedMem>>>(gpu_found_index, gpu_memory_block); // kernel executes correctly

err = cudaGetLastError();

    if(err != cudaSuccess) 

    {

	    printf("CUDA error(5): %s \n", cudaGetErrorString(err));

    }

cudaThreadSynchronize();

    err = cudaGetLastError();

if(err != cudaSuccess) 

    {

	    printf("CUDA error(6): %s \n", cudaGetErrorString(err));

    }

err = cudaMemcpy(gpu_found_index, cpu_found_index, foundSize, cudaMemcpyDeviceToHost); <== The error occurs here when trying to move mem from d to h

    if(err != cudaSuccess) 

    {

	    printf("CUDA error(7): %s \n", cudaGetErrorString(err));

	    //print_error(cudaGetErrorString(cudaGetLastError()));

    }

err = cudaMemcpy(gpu_memory_block, cpu_memory_block, memSize, cudaMemcpyDeviceToHost); <== Same error occurs at this point as well

if(err != cudaSuccess) 

    {

	    printf("CUDA error(8): %s \n", cudaGetErrorString(err));

	    //print_error(cudaGetErrorString(cudaGetLastError()));

    }

cudaFree(gpu_found_index);

    cudaFree(gpu_memory_block);

return 1;

  }

  else

  {

    return 0;

  }

}

Does anybody know if there is a method of debugging a cuda application starting from v3.2 SDK without making use of NVIDIA parallel NSIGHT? (my hardware does not allow it)

The first and second arguments need to be swapped in the following calls:

cudaMemcpy(gpu_found_index, cpu_found_index, foundSize, cudaMemcpyDeviceToHost);

cudaMemcpy(gpu_memory_block, cpu_memory_block, memSize, cudaMemcpyDeviceToHost);

You are copying from the device to the host, and the destination pointer is the first argument in a cudaMemcpy() call.

Thanks for the quick reply.I overlooked the parameter order by mistake.