My main entry function initializes CUDA, allocates device memory, copies data from the host, and runs a kernel. If I launch the kernel repeatedly in a loop, it works just fine. But if my main.cu explicitly calls the __global__ function from more than one place in the source, the earlier memory allocation fails! Universal causality meltdown?? Possibly a compiler problem??
// Copy the matrix to the device.
//
// NOTE(review): the original `assert(~cudaMalloc(...))` pattern is broken
// in two independent ways:
//  * cudaSuccess == 0, so ~cudaSuccess == -1 (truthy) and the assert passes
//    on success — but ~err is ALSO truthy for every failure code except -1,
//    so the assert never fires on a real allocation failure either.
//  * Under NDEBUG the whole assert expression — including the cudaMalloc /
//    cudaMemcpy / synchronize side effects — is compiled out entirely.
// Check the returned cudaError_t explicitly instead. Testing the pointers
// against 0 after cudaMalloc is also unreliable: on failure the pointer
// value is unspecified; the error code is the contract.
if (cudaMalloc((void**)&DeviceMatrix1, NodeCount * sizeof(YEEBOX)) != cudaSuccess ||
    cudaMalloc((void**)&DeviceMatrix2, NodeCount * sizeof(YEEBOX)) != cudaSuccess ||
    DeviceMatrix1 == 0 || DeviceMatrix2 == 0) {
    _tprintf(_T("Not enough Memory"));
    goto safeExit;
}
if (cudaMemcpy((void*)DeviceMatrix1, HostMatrix, NodeCount * sizeof(YEEBOX),
               cudaMemcpyHostToDevice) != cudaSuccess ||
    cudaMemcpy((void*)DeviceMatrix2, HostMatrix, NodeCount * sizeof(YEEBOX),
               cudaMemcpyHostToDevice) != cudaSuccess) {
    goto safeExit;
}

// Set up the display on a worker thread.
// NOTE(review): presumably OpenWindow matches LPTHREAD_START_ROUTINE's
// signature — the cast would hide a mismatch; verify its declaration.
CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)OpenWindow, NULL, 0, NULL);

// Begin computation.
_tprintf(_T("Iterating for %i cycles.\n"), M.t);
EHfield <<<gridBlock, dimBlock>>> ((YEEBOX*)DeviceMatrix1, (YEEBOX*)DeviceMatrix2,
                                   M.x, M.y, G.y, M.dx, M.dt);
// Kernel launches do not return errors directly: launch-configuration
// errors surface via cudaGetLastError(), in-kernel faults at the next
// synchronizing call. cudaThreadSynchronize() is deprecated — use
// cudaDeviceSynchronize(). NOTE(review): the reported "second launch site
// makes cudaMalloc fail" symptom is consistent with the kernel writing out
// of bounds and corrupting the context — the silent asserts above would
// have hidden exactly that; run under compute-sanitizer/cuda-memcheck.
if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
    goto safeExit;
}
Adding another explicit launch of the kernel, like this:
// Second explicit launch site — same kernel, same arguments.
EHfield <<<gridBlock, dimBlock>>> ((YEEBOX*)DeviceMatrix1, (YEEBOX*)DeviceMatrix2, M.x, M.y, G.y,
                                   M.dx, M.dt);
// NOTE(review): `assert(~cudaThreadSynchronize())` passes for every error
// code except -1 and disappears under NDEBUG (taking the synchronize call
// with it). Check the launch and the synchronize result explicitly;
// cudaThreadSynchronize() is deprecated in favor of cudaDeviceSynchronize().
if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
    goto safeExit;
}
causes the earlier cudaMalloc to return a null pointer, even though this second kernel launch has not yet executed.
I’m defeated on this one. Word up to tmurray! You always know what’s goin’ on!