I have been experiencing some very strange behavior. I was working on a complex problem that required a fair number of streams and synchronization, and my execution would hang from time to time, so naturally I went looking for race conditions etc… What I finally boiled it down to surprised me: a single-thread, lightweight kernel in the default stream with essentially no resource requirements, which reliably hangs.
First off: NO, it's not the 5-second WDDM watchdog timeout! I'm using Linux with a dedicated compute card; in fact I have tested it on multiple dedicated compute cards (GTX 770, GTX 750, GTX 560 and GTX 470) on platforms running CUDA 5.0 to 6.0, with various driver versions, and with the display manager on and off, and all show the same behavior: a hang after about 2 seconds of "compute", with single or multiple kernel invocations (code below).
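To rule the watchdog out explicitly, I query each card's kernelExecTimeoutEnabled property (the full code below does this when picking a card); for a given device number dev, the minimal check is just:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, dev);
if (prop.kernelExecTimeoutEnabled)
    printf("Device %d has a run-time watchdog, it's a display card.\n", dev);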
I wrote a simple kernel that spins for a given length of time, using clock() and the device clock rate (I have verified that it runs for approximately the correct amount of time, close enough for this demonstration). Then I simply call this kernel repeatedly in the default stream. Here each kernel runs for ~0.1 seconds, and after a number of calls (~17) everything hangs. I have tried with device and stream synchronization on and off, and with kernels that run for a long time and kernels that run for a short time; they all have the same problem (although the shorter the kernels, the longer it runs before hanging). Note that I discovered this behavior with a kernel that does not use clock(); I only use it here to demonstrate.
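For concreteness: deviceProp.clockRate is reported in kHz, so on the GTX 770 (about 1.14e6 kHz) a 0.1 s spin targets max = 0.1 * 1.14e6 * 1e3, roughly 1.1e8 cycles, which sits comfortably inside a 32-bit uint; the hang is therefore not the kernel's cycle counter wrapping.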
The only workaround I have found is to periodically reset the device, and in my opinion that is not a solution!
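In case it helps, the workaround looks like this in the main loop below (where the cudaDeviceReset() call is currently commented out); rr counts kernel calls since the last reset and nRest is the number of calls between resets:

if ( rr > nRest )
{
    cudaDeviceReset(); // tears the whole context down, hence "not a solution"
    rr = 0;
}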
Does anyone have any ideas as to what's going wrong here? Or what else I can do to further diagnose the problem?
Below is some very simple code to demonstrate the problem. You can change the length of each kernel run by editing kerTimeInSeconds. (Note it uses Linux gettimeofday; if someone wants to adapt it for Windows and tell me whether they see the same behavior, that would be great. A rough timing shim is sketched just below.)
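For Windows, something along these lines should work in place of the gettimeofday pairs (an untested sketch, assuming the standard Win32 QueryPerformanceCounter API; elapsedSeconds is just an illustrative helper name):

#ifdef _WIN32
#include <windows.h>
// Returns elapsed seconds between two QueryPerformanceCounter samples.
static double elapsedSeconds(LARGE_INTEGER start, LARGE_INTEGER end)
{
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq); // counter ticks per second
    return (double)(end.QuadPart - start.QuadPart) / (double)freq.QuadPart;
}
// Usage: LARGE_INTEGER t0, t1; QueryPerformanceCounter(&t0); ... QueryPerformanceCounter(&t1);
#endif

The full Linux program: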
#include <stdio.h> // printf
#include <iostream> // flush
#include <sys/time.h> // gettimeofday
#define CUDA_SAFE_CALL(value, errorMsg) \
{ \
    cudaError_t cudaStat = value; \
    if (cudaStat != cudaSuccess) \
    { \
        fprintf(stderr, "ERROR: %s [ %s at line %d in file %s ]\n", \
                errorMsg, cudaGetErrorString(cudaStat), __LINE__, __FILE__ ); \
        exit(EXIT_FAILURE); \
    } \
}
__global__ void sleep(float Num, const int clockrate)
{
    // Busy-wait for Num seconds: clockrate is deviceProp.clockRate in kHz,
    // so the target cycle count is Num * clockrate * 1e3.
    volatile clock_t start_clock = clock();
    volatile clock_t clock_offset = 0;
    volatile uint max = Num*(clockrate*1e3);
    while (clock_offset < max)
    {
        clock_offset = clock() - start_clock;
    }
}
int getComputeCard()
{
    cudaDeviceProp deviceProp;
    int deviceCount, currentDevice;
    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount), "Failed to get device count using cudaGetDeviceCount");
    if (deviceCount > 0)
    {
        printf("There are %i CUDA enabled devices on this computer/node:\n", deviceCount);
        int* devs = (int*)malloc(deviceCount*sizeof(int));
        int cntComp = 0;
        for (int device = 0; device < deviceCount; device++)
        {
            CUDA_SAFE_CALL( cudaSetDevice(device), "Failed to set device using cudaSetDevice");
            // Check that the current device really is 'device'
            CUDA_SAFE_CALL( cudaGetDevice(&currentDevice), "Failed to get device using cudaGetDevice" );
            if ( currentDevice != device )
            {
                fprintf(stderr, "ERROR: Device not set.\n");
                exit(EXIT_FAILURE);
            }
            CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, device), "Failed to get device properties using cudaGetDeviceProperties");
            if ( deviceProp.kernelExecTimeoutEnabled ) // watchdog enabled => display card
            {
                printf("   Device %d: [ Display ] \"%s\" which has CUDA Capability %d.%d and a clock rate of %.2f GHz\n", device, deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.clockRate*1e-6);
            }
            else                                       // no watchdog => dedicated compute card
            {
                printf("   Device %d: [ Compute ] \"%s\" which has CUDA Capability %d.%d and a clock rate of %.2f GHz\n", device, deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.clockRate*1e-6);
                devs[cntComp] = device;
                cntComp++;
            }
        }
        if ( cntComp > 0 )
        {
            int dev = devs[0]; // use the first dedicated compute card found
            free(devs);
            return dev;
        }
        else
        {
            free(devs);
            printf("Couldn't find a dedicated compute card, using device 0.\n");
            return 0;
        }
    }
    return -1;
}
int main(int argc, char *argv[])
{
    float kerTimeInSeconds = 0.1; // Change this to alter how long each kernel invocation runs for (seconds)
    cudaDeviceProp deviceProp;
    int driverVersion, runtimeVersion;
    struct timeval lstart, lend;
    int cuDevice = -1;
    int noReps = 4.0 / kerTimeInSeconds; // Calculate the number of calls to run for ~4 seconds
    printf("\n --== Spin test ==--\n\n");
    CUDA_SAFE_CALL( cudaDriverGetVersion(&driverVersion), "Failed to get driver version using cudaDriverGetVersion");
    CUDA_SAFE_CALL( cudaRuntimeGetVersion(&runtimeVersion), "Failed to get runtime version using cudaRuntimeGetVersion");
    printf("  CUDA Driver Version    %d.%d \n", driverVersion / 1000, (driverVersion % 100) / 10);
    printf("  Runtime Version        %d.%d \n\n", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    cuDevice = getComputeCard();
    if ( cuDevice >= 0 )
    {
        CUDA_SAFE_CALL( cudaSetDevice(cuDevice), "Failed to set CUDA device.");
        CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, cuDevice), "Failed to get device properties using cudaGetDeviceProperties.");
        CUDA_SAFE_CALL( cudaDeviceReset(), "Failed to reset device");
        //CUDA_SAFE_CALL( cudaSetDeviceFlags( cudaDeviceScheduleBlockingSync ), "Failed to set device flags."); // Uncomment if you are interested
        printf("\nRunning on device %d: \"%s\" which has CUDA Capability %d.%d and a clock rate of %.2f GHz\n", cuDevice, deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.clockRate*1e-6);
        printf("Doing %i repetitions at %0.04f seconds each so a run time of ~%0.2f seconds.\n\n", noReps, kerTimeInSeconds, noReps*kerTimeInSeconds);
        int rr = 0;
        int nRest = 0.9 / kerTimeInSeconds; // Kernel calls between (optional) resets, ~0.9 s of kernel time
        float time;
        printf("  %i repetitions for each reset!\n\n", nRest);
        gettimeofday(&lstart, NULL);
        for ( int i = 0; i < noReps; i++, rr++ )
        {
            sleep<<<1,1>>>(kerTimeInSeconds, deviceProp.clockRate);
            CUDA_SAFE_CALL( cudaGetLastError(), "Error at kernel launch");
            CUDA_SAFE_CALL( cudaDeviceSynchronize(), "Error at device synchronise");
            gettimeofday(&lend, NULL);
            time = ((lend.tv_sec - lstart.tv_sec)*1e6 + (lend.tv_usec - lstart.tv_usec));
            printf("\r%05i : %07.5fs  %6.2f%% complete.", i, time*1e-6, (i+1)/(float)noReps*100.0);
            std::cout.flush();
            if ( rr > nRest )
            {
                //CUDA_SAFE_CALL( cudaDeviceReset(), "Failed to reset device"); // Uncomment for the ugly workaround
                rr = 0;
            }
        }
        CUDA_SAFE_CALL( cudaDeviceSynchronize(), "Error at device synchronise");
        printf("\n  Yes Done! (I wish I could get here)\n");
    }
    else
    {
        fprintf(stderr, "ERROR: Failed to find any CUDA enabled devices.\n");
        exit(EXIT_FAILURE);
    }
    return 0;
}
And here is some sample output:
nvcc --compile -g -G -O3 -arch=sm_30 -x cu -o main.cu.o main.cu
nvcc --cudart static -g -G -link -o spintest main.cu.o
nvcc warning : The 'compute_10' and 'sm_10' architectures are deprecated, and may be removed in a future release.
./spintest
--== Spin test ==--
CUDA Driver Version 6.0
Runtime Version 6.0
There are 2 CUDA enabled devices on this computer/node:
Device 0: [ Compute ] "GeForce GTX 770" which has CUDA Capability 3.0 and a clock rate of 1.14 GHz
Device 1: [ Display ] "GeForce GTX 750 Ti" which has CUDA Capability 5.0 and a clock rate of 1.11 GHz
Running on device 0: "GeForce GTX 770" which has CUDA Capability 3.0 and a clock rate of 1.14 GHz
Doing 39 repetitions at 0.1000 seconds each so a run time of ~3.90 Seconds.
8 repetitions for each reset!
00017 : 1.74931s 46.15% complete.
Please help =/