Getting another weird error. Here is my code:
// Launches cudaGetMatches on a fixed 512-block x 16-thread grid and copies the
// device result buffer back to the host.
//
// pMatchList_CPU: host destination buffer; receives
//                 blockCnt * threadsPerBlock ints (one int per launched
//                 thread), so it must be at least that large.
// startIdx:       forwarded unchanged to the kernel.
//
// Every CUDA call is wrapped in CUDA_SAFE_CALL, and the kernel launch is
// checked explicitly so a failure is reported at the launch rather than at
// the later cudaMemcpy.
void GetMatches(int *pMatchList_CPU, int startIdx)
{
    // NOTE(review): 16 threads per block is half a warp; a multiple of 32 is
    // usually preferable if the kernel's indexing permits -- confirm against
    // the kernel before changing.
    const int blockCnt = 512;
    const int threadsPerBlock = 16;
    const int totalThreads = blockCnt * threadsPerBlock;

    // Byte count of the result buffer: one int per launched thread.
    const size_t dataSize = static_cast<size_t>(totalThreads) * sizeof(int);

    int *pMatchList_GPU = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &pMatchList_GPU, dataSize));
    CUDA_SAFE_CALL(cudaMemset((void*) pMatchList_GPU, 0, dataSize));

    cudaGetMatches<<<blockCnt, threadsPerBlock>>>(pMatchList_GPU, startIdx);

    // A kernel launch returns no status itself: configuration errors are
    // picked up by cudaGetLastError(), and faults raised while the kernel
    // runs are reported by the next synchronizing call. Checking both here
    // attributes a failure to the kernel instead of to the copy below.
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    CUDA_SAFE_CALL(cudaMemcpy(pMatchList_CPU, pMatchList_GPU, dataSize, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(pMatchList_GPU));
}
Everything appears to work fine up until the cudaMemcpy, which gives the less-than-helpful error message “unspecified launch failure.”
Am I missing something obvious?
Edit: Just one additional note: pMatchList_CPU is allocated in the calling method with a size of dataSize bytes. I also tried adding a cudaMallocHost call and copying from pMatchList_GPU to that buffer first, but I get the same error.