Hello,
I have an issue with zero-copy performance on the Jetson TX1 board. Following the instructions found on this forum, I came up with the test program below, which uses NPP (nppi) to perform a 45x45 convolution:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <nppi.h>

/* Print the source line of every check; bail out on the first failure. */
#define UNEXPECTED_ERROR_IF(stat) \
    do { \
        if(stat) { \
            printf("Line %d: Error\n", __LINE__); \
            return 1; \
        } else { \
            printf("Line %d OK\n", __LINE__); \
        } \
    } while(0)

/* Single-channel 32-bit float convolution via NPP; line step is 4*width bytes,
   border type 2 = NPP_BORDER_REPLICATE. */
int conv32(float *img_in, float *img_out, int width, int height, float *kernel, int fwidth, int fheight)
{
    NppiSize full, ks;
    NppiPoint anchor, zero;
    full.width = width;
    full.height = height;
    ks.width = fwidth;
    ks.height = fheight;
    anchor.x = fwidth/2;
    anchor.y = fheight/2;
    zero.x = 0;
    zero.y = 0;
    UNEXPECTED_ERROR_IF( nppiFilterBorder_32f_C1R(img_in, 4*width, full, zero, img_out, 4*width, full, kernel, ks, anchor, (NppiBorderType)2) );
    return 0;
}

int main()
{
    float *img_in_cpu = 0, *img_in_gpu = 0;
    float *img_out_cpu = 0, *img_out_gpu = 0;
    float *kernel_cpu = 0, *kernel_gpu = 0;
    float runtime;
    int i;
    int width = 640, height = 480, fwidth = 45, fheight = 45;

    cudaDeviceProp prop;
    UNEXPECTED_ERROR_IF( cudaGetDeviceProperties(&prop, 0) );
    printf( "canMapHostMemory: %d\n", (int)prop.canMapHostMemory );
    UNEXPECTED_ERROR_IF( !prop.canMapHostMemory );

    /* Allocate mapped (zero-copy) host buffers. */
    UNEXPECTED_ERROR_IF( cudaSetDevice(0) );
    UNEXPECTED_ERROR_IF( cudaSetDeviceFlags(cudaDeviceMapHost) );
    UNEXPECTED_ERROR_IF( cudaHostAlloc((void**)&img_in_cpu, sizeof(float)*640*480, cudaHostAllocMapped) );
    UNEXPECTED_ERROR_IF( cudaHostAlloc((void**)&img_out_cpu, sizeof(float)*640*480, cudaHostAllocMapped) );
    UNEXPECTED_ERROR_IF( cudaHostAlloc((void**)&kernel_cpu, sizeof(float)*45*45, cudaHostAllocMapped) );
    printf("cpu pointers: %p %p %p\n", img_in_cpu, img_out_cpu, kernel_cpu);

#if 1
    /* Get the device-side aliases of the mapped buffers. */
    UNEXPECTED_ERROR_IF( cudaHostGetDevicePointer( (void**)&img_in_gpu, (void*)img_in_cpu, 0) );
    UNEXPECTED_ERROR_IF( cudaHostGetDevicePointer( (void**)&img_out_gpu, (void*)img_out_cpu, 0) );
    UNEXPECTED_ERROR_IF( cudaHostGetDevicePointer( (void**)&kernel_gpu, (void*)kernel_cpu, 0) );
#else
    img_in_gpu = img_in_cpu;
    img_out_gpu = img_out_cpu;
    kernel_gpu = kernel_cpu;
#endif
    printf("gpu pointers: %p %p %p\n", img_in_gpu, img_out_gpu, kernel_gpu);

    /* Random input image, zeroed output, identity kernel (a single 1 at the center). */
    for( i = 0; i < width*height; i++ )
    {
        img_in_cpu[i] = (float)rand() / (float)RAND_MAX;
        img_out_cpu[i] = 0.f;
    }
    for( i = 0; i < fwidth*fheight; i++ )
    {
        kernel_cpu[i] = 0.f;
    }
    kernel_cpu[(fheight/2)*fwidth + (fwidth/2)] = 1.f;

    /* Time only the convolution call itself with CUDA events. */
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    UNEXPECTED_ERROR_IF( conv32(img_in_gpu, img_out_gpu, width, height, kernel_gpu, fwidth, fheight) );
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    UNEXPECTED_ERROR_IF( cudaEventElapsedTime( &runtime, start, stop) );
    printf( "Convolution done, runtime: %.1f ms\n", runtime );
    return 0;
}
The output of the program is as follows:
Line 43 OK
canMapHostMemory: 1
Line 45 OK
Line 46 OK
Line 47 OK
Line 49 OK
Line 50 OK
Line 51 OK
cpu pointers: 0x100be0000 0x100d20000 0x100e60000
Line 55 OK
Line 56 OK
Line 57 OK
gpu pointers: 0x100be0000 0x100d20000 0x100e60000
Line 29 OK
Line 81 OK
Line 84 OK
Convolution done, runtime: 3968.8 ms
Now, when I perform the same operation with a classical memcpy-based approach (image, kernel and output live in ordinary device memory as pycuda gpuarrays), I get kernel runtimes of around 22 ms, neglecting the transfer times. The test program for this is written in Python:
# benchmark gpu 32bit floating point convolution
from ctypes import *
from pycuda import gpuarray, autoinit
from scipy.ndimage import filters as scfi
import time
import sys
import numpy as np

# call NPP directly through ctypes
npp = CDLL("libnppi.so")

class NppiSize(Structure):
    _fields_ = [("width", c_int), ("height", c_int)]

class NppiPoint(Structure):
    _fields_ = [("x", c_int), ("y", c_int)]

npp.nppiFilterBorder_32f_C1R.argtypes = [c_void_p, c_int, NppiSize, NppiPoint,
                                         c_void_p, c_int, NppiSize, c_void_p, NppiSize,
                                         NppiPoint, c_int]
npp.nppiFilterBorder_32f_C1R.restype = c_int

def conv32(input, kernel, output):
    # single-channel 32-bit float convolution on pycuda device arrays
    assert( input.dtype == np.float32 and kernel.dtype == np.float32 and output.dtype == np.float32 and input.shape == output.shape )
    full = NppiSize()
    full.width = input.shape[1]
    full.height = input.shape[0]
    ks = NppiSize()
    ks.width = kernel.shape[1]
    ks.height = kernel.shape[0]
    anchor = NppiPoint()
    anchor.x = kernel.shape[1]//2
    anchor.y = kernel.shape[0]//2
    zero = NppiPoint()
    zero.x = 0
    zero.y = 0
    res = npp.nppiFilterBorder_32f_C1R(int(input.gpudata), 4*input.shape[1], full, zero,
                                       int(output.gpudata), 4*output.shape[1], full, int(kernel.gpudata), ks, anchor, 2)
    assert res == 0
    return output

def profconv(I, O, K, num_runs=10, burst_size=100):
    # time num_runs bursts of burst_size convolutions, synchronizing once per burst
    t = []
    lastT = time.time()
    for i in range(num_runs):
        for b in range(burst_size):
            O = conv32(I, K, O)
        autoinit.context.synchronize()
        ct = time.time()
        t.append((ct-lastT)/burst_size)
        lastT = ct
    runtimes = np.array(t) * 1000.
    flop_per_conv = I.size*K.size
    # the first burst is treated as warm-up and excluded from the averages
    print("Image dim %3d x %3d kernel dim %3d x %3d: runtime: avg=%7.2f ms std=%7.2fms first=%7.2f GFLOPS: avg=%7.3f" %
          (I.shape[0], I.shape[1], K.shape[0], K.shape[1], np.mean(runtimes[1:]), np.std(runtimes[1:]), runtimes[0],
           flop_per_conv/(np.mean(runtimes[1:])/1000.)/(1024*1024*1024)))
    #print(t)
    #print(runtimes)
    # verify the GPU result against scipy's CPU convolution
    ot = scfi.convolve(I.get(), K.get(), mode='nearest')
    eq = np.allclose(ot, O.get())
    if not np.all(eq):
        print("I=", I)
        print("K=", K)
        print("eq=", eq)
        print("O=", O)
        print("ot=", ot)
    return O

if __name__ == "__main__":
    num_runs = 10
    burst_size = 10
    if len(sys.argv) > 1:
        num_runs = int(sys.argv[1])
    if len(sys.argv) > 2:
        burst_size = int(sys.argv[2])
    # small sanity-check run
    I = gpuarray.to_gpu(np.reshape(np.arange(9, dtype=np.float32), (3, 3)))
    O = gpuarray.zeros(I.shape, dtype=np.float32)
    K = gpuarray.zeros((3, 3), dtype=np.float32) + 1/9.
    O = profconv(I, O, K)
    # 640x480 image, sweep over kernel sizes
    I = gpuarray.to_gpu(np.random.rand(640, 480).astype(np.float32))
    O = gpuarray.zeros(I.shape, dtype=np.float32)
    for ks in [(3,3), (3,5), (3,7), (3,9), (5,3), (5,5), (5,7), (5,9), (7,3), (7,5), (7,7), (7,9), (9,3), (9,5), (9,7), (9,9), (15,15), (25,25), (35,35), (45,45)]:
        K = gpuarray.to_gpu(np.random.rand(ks[0], ks[1]).astype(np.float32))
        O = profconv(I, O, K, num_runs=num_runs, burst_size=burst_size)
The output of this benchmark is:
Image dim 3 x 3 kernel dim 3 x 3: runtime: avg= 0.06 ms std= 0.00ms first= 9.68 GFLOPS: avg= 0.001
Image dim 640 x 480 kernel dim 3 x 3: runtime: avg= 0.63 ms std= 0.23ms first= 0.82 GFLOPS: avg= 4.109
Image dim 640 x 480 kernel dim 3 x 5: runtime: avg= 0.71 ms std= 0.15ms first= 0.87 GFLOPS: avg= 6.075
Image dim 640 x 480 kernel dim 3 x 7: runtime: avg= 0.61 ms std= 0.03ms first= 0.64 GFLOPS: avg= 9.855
Image dim 640 x 480 kernel dim 3 x 9: runtime: avg= 0.62 ms std= 0.01ms first= 0.64 GFLOPS: avg= 12.400
Image dim 640 x 480 kernel dim 5 x 3: runtime: avg= 0.65 ms std= 0.01ms first= 0.68 GFLOPS: avg= 6.642
Image dim 640 x 480 kernel dim 5 x 5: runtime: avg= 0.42 ms std= 0.00ms first= 0.42 GFLOPS: avg= 16.875
Image dim 640 x 480 kernel dim 5 x 7: runtime: avg= 0.77 ms std= 0.02ms first= 0.77 GFLOPS: avg= 12.963
Image dim 640 x 480 kernel dim 5 x 9: runtime: avg= 0.85 ms std= 0.02ms first= 0.89 GFLOPS: avg= 15.085
Image dim 640 x 480 kernel dim 7 x 3: runtime: avg= 0.80 ms std= 0.02ms first= 0.83 GFLOPS: avg= 7.551
Image dim 640 x 480 kernel dim 7 x 5: runtime: avg= 0.86 ms std= 0.04ms first= 0.88 GFLOPS: avg= 11.605
Image dim 640 x 480 kernel dim 7 x 7: runtime: avg= 0.99 ms std= 0.02ms first= 1.02 GFLOPS: avg= 14.137
Image dim 640 x 480 kernel dim 7 x 9: runtime: avg= 1.08 ms std= 0.03ms first= 1.12 GFLOPS: avg= 16.767
Image dim 640 x 480 kernel dim 9 x 3: runtime: avg= 0.92 ms std= 0.03ms first= 0.95 GFLOPS: avg= 8.353
Image dim 640 x 480 kernel dim 9 x 5: runtime: avg= 1.01 ms std= 0.03ms first= 1.05 GFLOPS: avg= 12.795
Image dim 640 x 480 kernel dim 9 x 7: runtime: avg= 1.16 ms std= 0.03ms first= 1.21 GFLOPS: avg= 15.475
Image dim 640 x 480 kernel dim 9 x 9: runtime: avg= 1.29 ms std= 0.03ms first= 1.36 GFLOPS: avg= 18.019
Image dim 640 x 480 kernel dim 15 x 15: runtime: avg= 2.77 ms std= 0.06ms first= 2.90 GFLOPS: avg= 23.268
Image dim 640 x 480 kernel dim 25 x 25: runtime: avg= 6.53 ms std= 0.04ms first= 6.96 GFLOPS: avg= 27.371
Image dim 640 x 480 kernel dim 35 x 35: runtime: avg= 13.08 ms std= 0.03ms first= 14.05 GFLOPS: avg= 26.792
Image dim 640 x 480 kernel dim 45 x 45: runtime: avg= 21.85 ms std= 0.05ms first= 23.52 GFLOPS: avg= 26.516
I don’t have an explanation for the massive performance degradation with the zero-copy variant: roughly 3970 ms versus 22 ms for the same 45x45 convolution, i.e. about a factor of 180. Can anyone help me out here?
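For reference, a device-memory variant of the C test would only change the allocation and transfer strategy (cudaMalloc plus explicit cudaMemcpy instead of mapped host memory). Below is just a sketch of that variant, reusing conv32() and the UNEXPECTED_ERROR_IF macro from the listing above; the 22 ms figure quoted earlier comes from the Python benchmark, not from this sketch:
int main()
{
    float *img_in_cpu, *img_out_cpu, *kernel_cpu;   /* plain host buffers */
    float *img_in_gpu, *img_out_gpu, *kernel_gpu;   /* device buffers */
    float runtime;
    int i;
    int width = 640, height = 480, fwidth = 45, fheight = 45;

    img_in_cpu  = (float*)malloc(sizeof(float)*width*height);
    img_out_cpu = (float*)malloc(sizeof(float)*width*height);
    kernel_cpu  = (float*)malloc(sizeof(float)*fwidth*fheight);
    UNEXPECTED_ERROR_IF( cudaMalloc((void**)&img_in_gpu,  sizeof(float)*width*height) );
    UNEXPECTED_ERROR_IF( cudaMalloc((void**)&img_out_gpu, sizeof(float)*width*height) );
    UNEXPECTED_ERROR_IF( cudaMalloc((void**)&kernel_gpu,  sizeof(float)*fwidth*fheight) );

    /* same test data as above: random image, identity kernel */
    for( i = 0; i < width*height; i++ )
        img_in_cpu[i] = (float)rand() / (float)RAND_MAX;
    for( i = 0; i < fwidth*fheight; i++ )
        kernel_cpu[i] = 0.f;
    kernel_cpu[(fheight/2)*fwidth + (fwidth/2)] = 1.f;

    /* upload inputs; these transfers are deliberately outside the timed region */
    UNEXPECTED_ERROR_IF( cudaMemcpy(img_in_gpu, img_in_cpu, sizeof(float)*width*height, cudaMemcpyHostToDevice) );
    UNEXPECTED_ERROR_IF( cudaMemcpy(kernel_gpu, kernel_cpu, sizeof(float)*fwidth*fheight, cudaMemcpyHostToDevice) );

    /* time only the convolution, as in the zero-copy version */
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    UNEXPECTED_ERROR_IF( conv32(img_in_gpu, img_out_gpu, width, height, kernel_gpu, fwidth, fheight) );
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    UNEXPECTED_ERROR_IF( cudaEventElapsedTime(&runtime, start, stop) );

    /* download the result, also outside the timed region */
    UNEXPECTED_ERROR_IF( cudaMemcpy(img_out_cpu, img_out_gpu, sizeof(float)*width*height, cudaMemcpyDeviceToHost) );
    printf("Convolution done, kernel runtime: %.1f ms\n", runtime);
    return 0;
}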
Here is the system configuration:
# lscpu
Architecture: aarch64
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 1
Core(s) per socket: 1
Socket(s): 4
Model name: ARMv8 Processor rev 1 (v8l)
CPU max MHz: 1734.0000
CPU min MHz: 102.0000
Hypervisor vendor: (null)
Virtualization type: full
# uname -mrs
Linux 3.10.96-tegra aarch64
# lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 16.04.2 LTS
Release: 16.04
Codename: xenial