Hi,
I have experienced extreme performance (1/100 or less) loss if I allocate unified memory (1024 Mbyte) even if I do not use (yet). Maybe I do something wrong, but what?
CUDA 8. (same results in 7.5) Gpu: GTX1070 (8Gbyte) CPU: i7-6700K 4Ghz 32Gbyte.
Program’s output:
Waste (unused allocated area) size is: 1073741824
Kernel use Gpu mem waste in gpu mem
kernel execution time: 15.0 ms
Kernel use Gpu mem, waste in unified (cpu) mem
kernel execution time: 1536.0 ms
Kernel use unified (cpu) mem, waste in gpu mem
kernel execution time: 110.0 ms
Kernel use unified (cpu) mem, waste in unified (cpu) mem
kernel execution time: 1640.0 ms
{1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}
Press any key to continue . . .
Here is code (based on cuda default template) :
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <ctime>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, bool managed, bool wmanaged, int wsize);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
cudaError_t cudaStatus;
int wsize = 1024 * 1024 * 1024;
printf("\r\n");
printf("Waste (unused allocated area) size is: %d\r\n", wsize);
printf("\r\n");
printf("Kernel use Gpu mem waste in gpu mem\r\n");
cudaStatus = addWithCuda(c, a, b, arraySize, false, false, wsize);
printf("\r\n");
printf("Kernel use Gpu mem, waste in unified (cpu) mem\r\n");
cudaStatus = addWithCuda(c, a, b, arraySize, false, true, wsize);
printf("\r\n");
printf("Kernel use unified (cpu) mem, waste in gpu mem\r\n");
cudaStatus = addWithCuda(c, a, b, arraySize, true, false, wsize);
printf("\r\n");
printf("Kernel use unified (cpu) mem, waste in unified (cpu) mem\r\n");
cudaStatus = addWithCuda(c, a, b, arraySize, true, true, wsize);
printf("\r\n");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
c[0], c[1], c[2], c[3], c[4]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, bool managed, bool wmanaged, int wsize)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
int *dev_waste = 0;
int csize = 1 * 1024 * 1024;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
// Allocate GPU buffers for three vectors (two input, one output) .
if (managed)
{
cudaStatus = cudaMallocManaged((void**)&dev_c, csize * sizeof(int));
cudaStatus = cudaMallocManaged((void**)&dev_a, csize * sizeof(int));
cudaStatus = cudaMallocManaged((void**)&dev_b, csize * sizeof(int));
}
else
{
cudaStatus = cudaMalloc((void**)&dev_c, csize * sizeof(int));
cudaStatus = cudaMalloc((void**)&dev_a, csize * sizeof(int));
cudaStatus = cudaMalloc((void**)&dev_b, csize * sizeof(int));
}
//Allocate waste
if (wsize)
{
if (wmanaged)
cudaStatus = cudaMallocManaged((void**)&dev_waste, wsize);
else
cudaStatus = cudaMalloc((void**)&dev_waste, wsize);
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
clock_t t1 = clock();
for (int i = 0; i < 128; i++)
{
// Launch a kernel on the GPU.
dim3 grid(csize / 1024);
dim3 block(1024);
addKernel << <grid, block >> >(dev_c, dev_a, dev_b);
cudaStatus = cudaGetLastError();
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
}
clock_t t2 = clock();
double d = t2 - t1;
d /= CLOCKS_PER_SEC;
d *= 1000;
printf("kernel execution time: %.1f ms\r\n", d);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
if (wsize)
cudaFree(dev_waste);
return cudaStatus;
}