Extreme performance degradation (<1/100) due to allocate unified memory area?

Hi,

I have experienced extreme performance (1/100 or less) loss if I allocate unified memory (1024 Mbyte) even if I do not use (yet). Maybe I do something wrong, but what?

CUDA 8. (same results in 7.5) Gpu: GTX1070 (8Gbyte) CPU: i7-6700K 4Ghz 32Gbyte.

Program’s output:

Waste (unused allocated area) size is: 1073741824

Kernel use Gpu mem waste in gpu mem
kernel execution time: 15.0 ms

Kernel use Gpu mem, waste in unified (cpu) mem
kernel execution time: 1536.0 ms

Kernel use unified (cpu) mem, waste in gpu mem
kernel execution time: 110.0 ms

Kernel use unified (cpu) mem, waste in unified (cpu) mem
kernel execution time: 1640.0 ms

{1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}
Press any key to continue . . .

Here is code (based on cuda default template) :

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <ctime>


cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, bool managed, bool wmanaged, int wsize);

__global__ void addKernel(int *c, const int *a, const int *b)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	c[i] = a[i] + b[i];
}

int main()
{
	const int arraySize = 5;
	const int a[arraySize] = { 1, 2, 3, 4, 5 };
	const int b[arraySize] = { 10, 20, 30, 40, 50 };
	int c[arraySize] = { 0 };
	cudaError_t cudaStatus;
	int wsize = 1024 * 1024 * 1024;
	printf("\r\n");
	printf("Waste (unused allocated area) size is: %d\r\n", wsize);
	printf("\r\n");
	printf("Kernel use Gpu mem waste in gpu mem\r\n");
	cudaStatus = addWithCuda(c, a, b, arraySize, false, false, wsize);
	printf("\r\n");
	printf("Kernel use Gpu mem, waste in unified (cpu) mem\r\n");
	cudaStatus = addWithCuda(c, a, b, arraySize, false, true, wsize);
	printf("\r\n");
	printf("Kernel use unified (cpu) mem, waste in gpu mem\r\n");
	cudaStatus = addWithCuda(c, a, b, arraySize, true, false, wsize);
	printf("\r\n");
	printf("Kernel use unified (cpu) mem, waste in unified (cpu) mem\r\n");
	cudaStatus = addWithCuda(c, a, b, arraySize, true, true, wsize);
	printf("\r\n");
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "addWithCuda failed!");
		return 1;
	}

	printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
		c[0], c[1], c[2], c[3], c[4]);

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Nsight and Visual Profiler to show complete traces.
	cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!");
		return 1;
	}
	return 0;
}

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, bool managed, bool wmanaged, int wsize)
{
	
	int *dev_a = 0;
	int *dev_b = 0;
	int *dev_c = 0;
	int *dev_waste = 0;
	int csize = 1 * 1024 * 1024;
	cudaError_t cudaStatus;

	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaStatus = cudaSetDevice(0);
	// Allocate GPU buffers for three vectors (two input, one output)    .
	if (managed)
	{
		cudaStatus = cudaMallocManaged((void**)&dev_c, csize * sizeof(int));
		cudaStatus = cudaMallocManaged((void**)&dev_a, csize * sizeof(int));
		cudaStatus = cudaMallocManaged((void**)&dev_b, csize * sizeof(int));

	}
	else
	{
		cudaStatus = cudaMalloc((void**)&dev_c, csize * sizeof(int));
		cudaStatus = cudaMalloc((void**)&dev_a, csize * sizeof(int));
		cudaStatus = cudaMalloc((void**)&dev_b, csize * sizeof(int));
	}
	//Allocate waste
	if (wsize)
	{
		if (wmanaged)
			cudaStatus = cudaMallocManaged((void**)&dev_waste, wsize);
		else
			cudaStatus = cudaMalloc((void**)&dev_waste, wsize);
	}
	// Copy input vectors from host memory to GPU buffers.
	cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
	clock_t t1 = clock();
	for (int i = 0; i < 128; i++)
	{
		// Launch a kernel on the GPU.
		dim3 grid(csize / 1024);
		dim3 block(1024);
		addKernel << <grid, block >> >(dev_c, dev_a, dev_b);
		cudaStatus = cudaGetLastError();
		// cudaDeviceSynchronize waits for the kernel to finish, and returns
		// any errors encountered during the launch.
		cudaStatus = cudaDeviceSynchronize();
	}
	clock_t t2 = clock();
	double d = t2 - t1;
	d /= CLOCKS_PER_SEC;
	d *= 1000;
	printf("kernel execution time: %.1f ms\r\n", d);
	// Copy output vector from GPU buffer to host memory.
	cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);

	cudaFree(dev_c);
	cudaFree(dev_a);
	cudaFree(dev_b);
	if (wsize)
		cudaFree(dev_waste);
	return cudaStatus;
}