Thanks for that tip! However…
I’m finally back in the lab where I can test the code, and the procedure that calls the kernel (which spawns several thousand threads) now looks like the listing below. (All I’m doing is computing a distance matrix on very large matrices.) The problem: cudaThreadSynchronize now fails with "unknown error" at the call immediately after the kernel launch, right before the device-to-host cudaMemcpy.
The getDim procedure scales all of the dimensions (units per thread, threads per block, number of blocks) so that no more than 1024 threads per block are launched.
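Roughly, getDim boils down to something like this (a simplified sketch, not my exact code; the real version also works out the units-per-thread factor):

void getDim(int Vn, int *blockDim_1D, int *threadDim)
{
    // Cap the block at 32 x 32 = 1024 threads, then use enough blocks in each
    // grid dimension to cover all Vn rows/columns (one element per thread here).
    *threadDim = 32;
    *blockDim_1D = (Vn + *threadDim - 1) / *threadDim;
}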
Any tips would be greatly appreciated.
The calling procedure:
__declspec(dllexport) int distanceMatrix(double *in, int Vn, int Vx, double *out) {
    double *dev_a;
    double *dev_b;
    int threadDim;
    int blockDim_1D;
    int nSquared;

    nSquared = Vn * Vn;
    cudaError_t status = cudaSuccess;
    getDim(Vn, &blockDim_1D, &threadDim);

    // Because the distance matrix is by definition a square, the x and y
    // dimensions are equal.
    dim3 threads(threadDim, threadDim);
    dim3 blocks(blockDim_1D, blockDim_1D);
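
    // (For reference: this launches a square grid of blockDim_1D x blockDim_1D
    //  blocks with threadDim x threadDim threads each, i.e.
    //  (blockDim_1D * threadDim)^2 threads in total, which getDim is meant to
    //  size so the Vn x Vn output is covered, possibly several entries per thread.)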

    // allocate CUDA memory for Input
    status = cudaMalloc((void**) &dev_a, Vn * Vx * sizeof(double));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc Input %s\n", cudaGetErrorString(status));
        return 1;
    }

    // allocate CUDA memory for Output
    status = cudaMalloc((void**) &dev_b, nSquared * sizeof(double));
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc Output %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Copy data to CUDA
    status = cudaMemcpy(dev_a, in, Vn * Vx * sizeof(double), cudaMemcpyHostToDevice);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy Input %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Execute the kernel
    distance_kernel<<<blocks, threads>>>(dev_a, dev_b, threads, Vn, Vx);
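
    // This is the cudaThreadSynchronize call that currently fails with "unknown error".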
    status = cudaThreadSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaThreadSynchronize before %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Copy data from CUDA
    status = cudaMemcpy(out, dev_b, nSquared * sizeof(double), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        if (status == cudaErrorInvalidValue)
            std::cout << "cudaErrorInvalidValue" << std::endl;
        else if (status == cudaErrorInvalidDevicePointer)
            std::cout << "cudaErrorInvalidDevicePointer" << std::endl;
        else if (status == cudaErrorInvalidMemcpyDirection)
            std::cout << "cudaErrorInvalidMemcpyDirection" << std::endl;
        else
            fprintf(stderr, "cudaMemcpy Device to Host 2 %s\n", cudaGetErrorString(status));
        return 1;
    }

    status = cudaThreadSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaThreadSynchronize after %s\n", cudaGetErrorString(status));
        return 1;
    }
    // Free CUDA memory
    status = cudaFree(dev_b);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaFree %s\n", cudaGetErrorString(status));
        return 1;
    }

    status = cudaFree(dev_a);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaFree %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
} // distanceMatrix
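If it helps narrow things down, here is a sketch of a finer-grained check I could wrap around the kernel launch (not my actual code; launchStatus and execStatus are placeholder names). As I understand it, cudaGetLastError right after the launch reports configuration problems, while errors from inside the kernel only surface at the next synchronizing call:

    // Sketch only: separate the launch check from the execution check.
    distance_kernel<<<blocks, threads>>>(dev_a, dev_b, threads, Vn, Vx);

    // Launch/configuration errors (e.g. an invalid block size) are reported here.
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
        fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(launchStatus));

    // Errors raised while the kernel runs (e.g. out-of-bounds accesses) only
    // show up at the next synchronizing call.
    cudaError_t execStatus = cudaThreadSynchronize();
    if (execStatus != cudaSuccess)
        fprintf(stderr, "kernel execution: %s\n", cudaGetErrorString(execStatus));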