Synchronization spins CPU under Linux - Tesla driver bug

Under the following configuration:
HW: Tesla K20
OS: Fedora 21 - 3.18.7-200.fc21.x86_64 kernel
Nvidia driver: 346.35
Cuda: 6.5

We did notice that basically any blocking synchronization spins the CPU as if it was implemented internally in your driver with a busy wait like construct. We could observe this with both cudaStreamSynchronize() and cudaEventSynchronize(). Of course it renders the particular system useless for compute since it increases the CPU load to an unbearable level. Under Windows, we don’t see the same problem.

Here you are with a few lines of dummy code which can reproduce the issue:

#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <cuda_runtime.h>

#define BUFFER_SIZE (1u << 27) /* 100 MiB */

#define CUDA_CHECK(x) \
do \
{ \
	if (x != cudaSuccess) \
		return -1; \
} while(0)

static bool stop = false;

static void handler(int signal)
{
	stop = true;
}

static double now()
{
	struct timespec time = {0};
	clock_gettime(CLOCK_MONOTONIC_RAW, &time);
	return (time.tv_sec * 1000000000 + time.tv_nsec) / 1000000000.0;
}

int main(int argc, char **argv)
{
	/* Handle SIGINT */
	{
		printf("Handle SIGINT signals");
		struct sigaction act = {0};
		act.sa_handler = handler;
		if (sigaction(SIGINT, &act, NULL))
			return -1;
	}

	/* Initialise CUDA */
	{
		int numDevices;
		CUDA_CHECK(cudaGetDeviceCount(&numDevices));
		printf("Found %d CUDA capable devices\n", numDevices);
		if (numDevices < 1)
		{
			printf("No devices found, exiting.");
			return -1;
		}

		CUDA_CHECK(cudaSetDevice(0));
	}

	/* Make a stream and allocate some buffers */
	cudaStream_t stream;
	uint8_t *inputHost1, *inputHost2, *outputHost;
	uint8_t *inputDevice1, *inputDevice2, *outputDevice;
	{
		printf("Creating stream\n");
		CUDA_CHECK(cudaStreamCreate(&stream));

		printf("Allocating host buffers\n");
		CUDA_CHECK(cudaMallocHost(&inputHost1, BUFFER_SIZE));
		CUDA_CHECK(cudaMallocHost(&inputHost2, BUFFER_SIZE));
		CUDA_CHECK(cudaMallocHost(&outputHost, BUFFER_SIZE));

		printf("Allocating device buffers\n");
		CUDA_CHECK(cudaMalloc(&inputDevice1, BUFFER_SIZE));
		CUDA_CHECK(cudaMalloc(&inputDevice2, BUFFER_SIZE));
		CUDA_CHECK(cudaMalloc(&outputDevice, BUFFER_SIZE));
	}

	/* Initialise input data */
	printf("Initialising data\n");
	for (size_t index = 0; index < BUFFER_SIZE; index++)
	{
		/* I know, I know, this is garbage */
		inputHost1[index] = index % 10;
		inputHost2[index] = ((index + 2) >> 1) % 10;
	}

	/* Work loop */
	double memcpyTime = 0.0;
	double syncTime = 0.0;
	double start = now();
	double previous = start;
	double current;
	uint64_t iterations = 0;

	printf("Starting work\n");
	while (stop == false)
	{
		/* Upload data */
		CUDA_CHECK(cudaMemcpyAsync(
					inputDevice1, inputHost1, BUFFER_SIZE,
					cudaMemcpyDeviceToHost, stream));
		CUDA_CHECK(cudaMemcpyAsync(
					inputDevice2, inputHost2, BUFFER_SIZE,
					cudaMemcpyDeviceToHost, stream));

		/* Launch kernel */

		/* Download data */
		CUDA_CHECK(cudaMemcpyAsync(
					outputHost, outputDevice, BUFFER_SIZE,
					cudaMemcpyHostToDevice, stream));

		current = now();
		memcpyTime += current - previous;
		previous = current;

                /* the spinning on the CPU coccurs here */
		CUDA_CHECK(cudaStreamSynchronize(stream));

                /* this is a viable work-around for the problem, but we'd prefer to see a driver-fix */
		//while (cudaStreamQuery(stream) != cudaSuccess)
		//	usleep(100);

		current = now();
		syncTime += current - previous;
		previous = current;

		iterations++;
	}

	double end = now();

	/* Print timing */
	printf("Spent %lf seconds in cudaMemcpyAsync()\n", memcpyTime);
	printf("Spent %lf seconds in cudaStreamSynchronize()\n", syncTime);
	printf("Ran at a speed of %lf iterations/second\n", iterations/(end - start));

	/* Verify output data */
	printf("Veryfying data\n");
	for (size_t index = 0; index < BUFFER_SIZE; index++)
	{
		if (outputHost[index] != inputHost1[index] + inputHost2[index])
		{
			printf("INVALID RESULT at index %llu!\n", index);
			break;
		}
	}

	/* Cleanup stream and buffers */
	{
		printf("Freeing device buffers\n");
		CUDA_CHECK(cudaFree(inputDevice1));
		CUDA_CHECK(cudaFree(inputDevice2));
		CUDA_CHECK(cudaFree(outputDevice));

		printf("Freeing host buffers\n");
		CUDA_CHECK(cudaFreeHost(inputHost1));
		CUDA_CHECK(cudaFreeHost(inputHost2));
		CUDA_CHECK(cudaFreeHost(outputHost));

		printf("Destroying stream\n");
		CUDA_CHECK(cudaStreamDestroy(stream));
	}


	return 0;
}

Which driver version would you recommend us to use, and what is your ETA to reduce your driver’s CPU load to a reasonable level - in case it has not been fixed already?

You can find a viable workaround as a comment in the code as well.

Thank you very much in advance for your co-operation.

Please note that these forums are designed as a community platform, not a bug reporting channel. For bug reports you would want to use the bug reporting form linked from the CUDA registered developer website.

Note that Fedora 21 is not a supported Linux platform for CUDA 6.5 per NVIDIA’s documentation:
http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/index.html

You may observe a difference in behavior if you set alternate device flags:

http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g18074e885b4d89f5a0fe1beab589e0c8

You might also want to try using CUDA 7 RC - it is newer, and it explicitly indicates Fedora 21 as a supported Distro/Version.

Thank you, this latter proved to be very helpful. cudaDeviceScheduleBlockingSync turned out to be a solution, however I am not sure I understand why does one need to set this behaviour explicitly, as least this is what I think what blocking sync is ought to mean.

Basically, I see and understand the meaning of “spin” vs “yield” defined in the document, but cudaDeviceScheduleBlockingSync is essentially defined by “blocking sync does what it does”. Could you please shed some light on this?