Hello,
I am trying to optimize a serial application using cuda.The code I’ve written seems to compile just fine.However it fails at some point during execution.
I’ve narrowed it down to the troublesome line and cudaMemcpy gives an invalid argument error.I’m working on v3.2 SDK on a windows platform which does not allow me to build EmuRelease or EmuDebug projects.
Here is the code:
int start_kernel_search(int dimGridx, int dimGridy, int dimBlockx, int dimBlocky, int dimBlockz, int sharedMem, uint *cpu_memory_block,uint cpu_memory_block_size, long *cpu_found_index)
{
if(cpu_memory_block != NULL && cpu_found_index != NULL)
{
dim3 grid, block;
uint4 *gpu_memory_block;
long *gpu_found_index;
size_t memSize, foundSize;
cudaError_t err;
grid.x = dimGridx; // 1 when called
grid.y = dimGridy; // 1 when called
grid.z = 1;
block.x = dimBlockx; // 512 when called
block.y = dimBlocky; // 1 when called
block.z = dimBlockz; // 1 when called
memSize = cpu_memory_block_size * sizeof(uint4);
foundSize = 1 * sizeof(long);
err = cudaMalloc( (void**)&gpu_memory_block, memSize); // memSize is 96 in my test example
if(err != cudaSuccess)
{
printf("CUDA error(1): %s \n", cudaGetErrorString(err));
}
err = cudaMalloc( (void**)&gpu_found_index, foundSize);
if(err != cudaSuccess)
{
printf("CUDA error(2): %s \n", cudaGetErrorString(err));
}
err = cudaMemcpy(gpu_memory_block, (uint4*)cpu_memory_block, memSize, cudaMemcpyHostToDevice); // is this cast meaningful or not?
if(err != cudaSuccess)
{
printf("CUDA error(3): %s \n", cudaGetErrorString(err));
}
err = cudaMemcpy(gpu_found_index, cpu_found_index, foundSize, cudaMemcpyHostToDevice);
if(err != cudaSuccess)
{
printf("CUDA error(4): %s \n", cudaGetErrorString(err));
}
compute_hashes_on_memory_block_items<<<grid, block, sharedMem>>>(gpu_found_index, gpu_memory_block); // kernel executes correctly
err = cudaGetLastError();
if(err != cudaSuccess)
{
printf("CUDA error(5): %s \n", cudaGetErrorString(err));
}
cudaThreadSynchronize();
err = cudaGetLastError();
if(err != cudaSuccess)
{
printf("CUDA error(6): %s \n", cudaGetErrorString(err));
}
err = cudaMemcpy(gpu_found_index, cpu_found_index, foundSize, cudaMemcpyDeviceToHost); <== The error occurs here when trying to move mem from d to h
if(err != cudaSuccess)
{
printf("CUDA error(7): %s \n", cudaGetErrorString(err));
//print_error(cudaGetErrorString(cudaGetLastError()));
}
err = cudaMemcpy(gpu_memory_block, cpu_memory_block, memSize, cudaMemcpyDeviceToHost); <== Same error occurs at this point as well
if(err != cudaSuccess)
{
printf("CUDA error(8): %s \n", cudaGetErrorString(err));
//print_error(cudaGetErrorString(cudaGetLastError()));
}
cudaFree(gpu_found_index);
cudaFree(gpu_memory_block);
return 1;
}
else
{
return 0;
}
}
Does anybody know if there is a method of debugging a cuda application starting from v3.2 SDK without making use of NVIDIA parallel NSIGHT? (my hardware does not allow it)