Does the sample code below behave as the CUDA documentation specifies? The documentation is difficult to decipher in this area.
I would expect both settings of mallocMode (cstd and cuda)
to produce the same result (3), but they do not: cstd prints 4. Why is that?
//usr/local/cuda/bin/nvcc $0 -lcuda -lcudart --run; exit
//This is a self-running source file.
//chmod +x thisFile.cpp && ./thisFile.cpp to run it.
#include "cuda.h"
#include "cuda_runtime.h"
#include <memory.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Selects how main() allocates its two host buffers:
//   cstd - malloc()
//   cuda - cuMemAllocHost()
// Observed: mallocMode = cuda works, mallocMode = cstd fails.
enum MallocMode {
    cstd,
    cuda
};
MallocMode mallocMode = cstd;
// Argument bundle handed to hostFunc through its void* user-data pointer.
// Member order matters: main() fills it with aggregate initialization.
struct Params {
    uint8_t *pSrc;  // bytes read by hostFunc
    uint8_t *pDst;  // bytes written by hostFunc
    size_t size;    // number of bytes processed in both buffers
};
void hostFunc(void *pUserData) {
Params *pParams = (Params*)pUserData;
for (size_t i=0; i<pParams->size; i++)
pParams->pDst[i] = pParams->pSrc[i] | 2;
};
int main() {
cuInit(0);
CUdevice device;
cuDeviceGet(&device, 0);
CUcontext ctx;
cuDevicePrimaryCtxRetain(&ctx, device);
cuCtxPushCurrent(ctx);
size_t bufferSize = 1<<20;
uint8_t *hostSrc, *hostDst;
switch(mallocMode) {
case cstd:
hostSrc = (uint8_t*)malloc(bufferSize);
hostDst = (uint8_t*)malloc(bufferSize);
break;
case cuda:
cuMemAllocHost((void**)&hostSrc, bufferSize);
cuMemAllocHost((void**)&hostDst, bufferSize);
break;
}
uint8_t *devSrc;
cuMemAlloc((CUdeviceptr*)&devSrc, bufferSize);
cudaStream_t stream = nullptr;
memset(hostSrc, 1, bufferSize);
memset(hostDst, 4, bufferSize);
Params hostParams {hostSrc, hostDst, bufferSize};
cuLaunchHostFunc(stream, hostFunc, &hostParams);
cuMemcpyHtoDAsync((CUdeviceptr)devSrc, hostDst, bufferSize, stream);
cuMemcpyDtoHAsync(hostDst, (CUdeviceptr)devSrc, bufferSize, stream);
cuStreamSynchronize(stream);
printf("# %d\n", hostDst[0]); // 3 = success. 4 = failure.
return 0;
}