Hi, I write an application that processes 3D volumes of image data. It splits its work among multiple CPU cores using a separate host thread for each volume. What we’re trying to do now is allow one or more of those host threads to transfer their volumes to a GPU when GPU memory permits. Our implementation is using a mix of CUDA, NPP, and cuFFT.
It seems to work fine when only one thread’s work is shifted to the GPU, but to allow more than one at a time, we’re trying to create a stream on each host thread… and that is throwing the following error when it hits an NPP call:
Unhandled exception at 0x00007FFBD2487788 in NppSetStream.exe: Microsoft C++ exception: NppStatus at memory location 0x000000FD306FF400.
Below is the NppSetStream test program that replicates the issue (for me, anyway). In debug mode, the crash occurs only in the sumImage function, at the nppsSumGetBufferSize_32f call, when using the program’s “scenario 2” (where it launches a thread, waits for it to finish, then launches the second one). In release mode, it crashes in all scenarios.
On the other hand, if I use only the default stream (i.e., replace the nppSetStream(cStream) call with nppSetStream(0)), it seems to work all right for this program, but presents its own set of issues in our production application when multiple host threads access the GPU concurrently (though that is a separate issue).
#include "cuda_runtime.h"
#include "curand.h"
#include "curand_kernel.h"
#include "npp.h"
#include <stdio.h>
#include <Windows.h>
__global__ void setRandom(float* dData, int size, long long randSeed);
void initImageData(float* dData, int size, cudaStream_t cStream);
DWORD WINAPI runNppStreamTest(LPVOID jobId);
float sumImage(float* dImage, int nSize, cudaStream_t cStream)
{
Npp32f* dSum;
NppStatus nppErr;
cudaError_t cudaErr;
nppSetStream(cStream);
int bufferSize;
nppErr = nppsSumGetBufferSize_32f((int)nSize, &bufferSize); // CRASH OCCURS HERE IN SCENARIO 2
if (nppErr != NppStatus::NPP_NO_ERROR)
{
printf("Error obtaining buffer size in sumImage\n");
return 0.0f;
}
// allocate a device buffer to store sub-summations
Npp8u* dSumArray = NULL;
cudaErr = cudaMalloc((void**)&dSumArray, bufferSize);
if (cudaErr != cudaSuccess)
{
printf("Unable to allocate working array in sumImage\n");
return 0.0f;
}
// allocate a device pointer to store the final sum
cudaErr = cudaMalloc((void**)&dSum, sizeof(Npp32f));
if (cudaErr != cudaSuccess)
{
printf("Unable to allocate result buffer in sumImage\n");
cudaFree(dSumArray);
return 0.0f;
}
float hSum = 0.0f;
// Compute the sum
nppErr = nppsSum_32f(dImage, (int)nSize, dSum, dSumArray);
if (nppErr != NppStatus::NPP_NO_ERROR)
{
printf("Error computing sum in sumImage\n");
}
else
{
// Copy the result to the host
cudaMemcpy(&hSum, dSum, sizeof(Npp32f), cudaMemcpyDeviceToHost);
}
cudaFree(dSumArray);
cudaFree(dSum);
return hSum;
}
int main()
{
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
return 1;
}
HANDLE hThreadArray[2];
int jobId1 = 1;
int jobId2 = 2;
int test = -1;
int err = 0;
while ((test < 0) || (test > 2) || (err < 1))
{
printf("\nWhich test?\n\t0 = main thread sequential\n\t1 = background thread concurrent\n\t2 = background thread sequential\n: ");
err = scanf("%d", &test);
while (getchar() != '\n');
}
switch (test)
{
case 0:
// Test scenario 0: Run NPP tests sequentially in main thread
runNppStreamTest(&jobId1);
runNppStreamTest(&jobId2);
break;
case 1:
// Test scenario 1: Run NPP tests concurrently in new threads
hThreadArray[0] = CreateThread(NULL, 0, &runNppStreamTest, &jobId1, 0, NULL);
hThreadArray[1] = CreateThread(NULL, 0, &runNppStreamTest, &jobId2, 0, NULL);
WaitForMultipleObjects(2, hThreadArray, TRUE, INFINITE);
break;
case 2:
// Test scenario 2: Run NPP tests sequentally in new threads
hThreadArray[0] = CreateThread(NULL, 0, &runNppStreamTest, &jobId1, 0, NULL);
WaitForMultipleObjects(1, hThreadArray, TRUE, INFINITE);
hThreadArray[0] = CreateThread(NULL, 0, &runNppStreamTest, &jobId2, 0, NULL);
WaitForMultipleObjects(1, hThreadArray, TRUE, INFINITE);
break;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
printf("Press enter to exit");
while (getchar() != '\n');
return 0;
}
DWORD WINAPI runNppStreamTest(LPVOID jobId)
{
cudaError_t cudaStatus;
float* image;
int dims[] = { 512, 512, 256 };
int size = dims[0] * dims[1] * dims[2];
cudaStream_t myStream;
cudaStatus = cudaStreamCreate(&myStream);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaStreamCreate failed in runNppStreamTest on job %d!\n", *(int*)(jobId));
return 1;
}
cudaStatus = cudaMalloc((void**)&image, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed in runNppStreamTest on job %d!\n", *(int*)(jobId));
return 1;
}
initImageData(image, size, myStream);
float sumValue = sumImage(image, size, myStream);
printf("Sum = %f on job %d\n", sumValue, *(int*)(jobId));
cudaFree(image);
cudaStatus = cudaSuccess;
if (myStream != NULL)
{
cudaStatus = cudaStreamDestroy(myStream);
}
return (cudaStatus != cudaSuccess) ? 1 : 0;
}
__global__ void setRandom(float* dData, int size, long long randSeed)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
curandState_t state;
curand_init(randSeed, 0, 0, &state);
for (int i = threadID; i < size; i += numThreads)
{
dData[i] = (float)(curand_uniform(&state));
}
}
void initImageData(float* dData, int size, cudaStream_t cStream)
{
setRandom << < 128, 256, 0, cStream >> >(dData, size, GetCurrentThreadId());
}