performance problem

I am learning CUDA programming. I encountered a performance problem which I couldn't explain. It is related to streaming.

my PC:
Intel® Core™ i7-7700K CPU @ 4.20GHz + GeForce GTX 1060 6GB (4095 MB)

Code 1(without streaming, synchronous memory copy) takes about 70ms
Code 2(with one streaming, asynchronous memory copy) takes about 23ms
What puzzles me is that Code 2 has only one stream, yet its performance improves greatly compared to Code 1.
In my opinion, streams contribute to performance by pipelining (overlapping copy and compute), but that requires at least two streams. So why does a single stream help in my case?

/* Code 1 begin ****************************************/
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define HANDLE_ERROR( err ) (err)
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

// Kernel: for each of the N elements, averages a[idx] with two low-index
// neighbors (wrapped mod 256), does the same for b, and writes the mean of
// the two averages into c (implicitly truncated to int on store).
// Expects a 1-D launch with at least N total threads; excess threads exit
// via the bounds check.
// NOTE: the original post lost the "__global__" qualifier to markdown
// formatting; it is restored here so the code compiles.
__global__ void kernel( int *a, int *b, int *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // Neighbor indices wrap within the first 256 elements -- this is the
        // (intentionally artificial) workload from "CUDA by Example".
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

// Host driver, synchronous version: processes FULL_DATA_SIZE ints in N-sized
// chunks using blocking cudaMemcpy on pageable (malloc'd) host memory, and
// times the whole copy/compute loop with CUDA events.
// Fixes vs. the posted code: straight quotes in the printf string (the smart
// quotes would not compile), the host buffers are now freed (they leaked),
// the unused cudaStream_t local is removed, and the timing events are
// destroyed.
int main( void ) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t start, stop;
    float       elapsedTime;

    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // create the timing events
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // allocate device memory: one N-int chunk per buffer, reused every pass
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // ordinary pageable host memory -- deliberately NOT pinned; this is the
    // variable being compared against Code 2
    host_a = (int*)malloc( FULL_DATA_SIZE * sizeof(int) );
    host_b = (int*)malloc( FULL_DATA_SIZE * sizeof(int) );
    host_c = (int*)malloc( FULL_DATA_SIZE * sizeof(int) );
    if (!host_a || !host_b || !host_c) {
        printf( "host allocation failed\n" );
        return 1;
    }

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // loop over the full data in bite-sized chunks
    for (int i = 0; i < FULL_DATA_SIZE; i += N) {
        // blocking copies: each cudaMemcpy stalls the CPU until the transfer
        // finishes, so there is no copy/compute overlap here
        HANDLE_ERROR( cudaMemcpy( dev_a, host_a + i,
                                  N * sizeof(int),
                                  cudaMemcpyHostToDevice ) );
        HANDLE_ERROR( cudaMemcpy( dev_b, host_b + i,
                                  N * sizeof(int),
                                  cudaMemcpyHostToDevice ) );

        kernel<<<N/256,256>>>( dev_a, dev_b, dev_c );

        // blocking copy back also serializes against the kernel
        HANDLE_ERROR( cudaMemcpy( host_c + i, dev_c,
                                  N * sizeof(int),
                                  cudaMemcpyDeviceToHost ) );
    }

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time taken:  %3.1f ms\n", elapsedTime );

    // cleanup: free() pairs with malloc() (the original never released the
    // host buffers)
    free( host_a );
    free( host_b );
    free( host_c );
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    system("pause");
    return 0;
}
/* Code 1 end ****************************************/

/* Code 2 begin ****************************************/
#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define HANDLE_ERROR( err ) (err)

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

// Kernel (identical workload to Code 1): three-point average of a and of b
// using two mod-256 wrapped neighbor indices, then the mean of the two
// averages is stored into c (truncated to int).
// 1-D launch with >= N threads assumed; out-of-range threads do nothing.
// NOTE: "__global__" was eaten by markdown in the original post; restored
// here so the code compiles.
__global__ void kernel( int *a, int *b, int *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // wrapped neighbor indices, per the "CUDA by Example" sample
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

// Host driver, single-stream version: same chunked workload as Code 1, but
// host buffers are pinned (cudaHostAlloc) and all copies are cudaMemcpyAsync
// issued into one non-default stream, with one cudaStreamSynchronize before
// stopping the timer.
// The speedup over Code 1 comes from pinned-memory transfer bandwidth, not
// from stream overlap -- with a single stream all operations still execute
// in issue order.
// Fixes vs. the posted code: straight quotes in the printf string (the smart
// quotes would not compile) and the timing events are destroyed.
int main( void ) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t  start, stop;
    float        elapsedTime;

    cudaStream_t stream;
    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // create the timing events
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the stream
    HANDLE_ERROR( cudaStreamCreate( &stream ) );

    // allocate device memory: one N-int chunk per buffer, reused every pass
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // pinned (page-locked) host memory: required for truly asynchronous
    // cudaMemcpyAsync, and transfers from it are faster than from pageable
    // memory -- this is the variable being compared against Code 1
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // loop over the full data in bite-sized chunks; within one stream the
    // copy->kernel->copy sequence of each chunk still runs in order
    for (int i = 0; i < FULL_DATA_SIZE; i += N) {
        HANDLE_ERROR( cudaMemcpyAsync( dev_a, host_a + i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b, host_b + i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream ) );

        kernel<<<N/256,256,0,stream>>>( dev_a, dev_b, dev_c );

        HANDLE_ERROR( cudaMemcpyAsync( host_c + i, dev_c,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream ) );
    }
    // wait for every queued operation in the stream before stopping the timer
    HANDLE_ERROR( cudaStreamSynchronize( stream ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time taken:  %3.1f ms\n", elapsedTime );

    // cleanup: cudaFreeHost pairs with cudaHostAlloc
    HANDLE_ERROR( cudaFreeHost( host_a ) );
    HANDLE_ERROR( cudaFreeHost( host_b ) );
    HANDLE_ERROR( cudaFreeHost( host_c ) );
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );
    HANDLE_ERROR( cudaStreamDestroy( stream ) );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    system("pause");
    return 0;
}

/* Code 2 end ****************************************/

Most of the time spent in your application is in the Host to Device data transfers. When you use pinned buffers, these transfers take less time.

In your first case you are not using pinned buffers, and the transfers take longer.

In the second case you are using pinned buffers, and the data transfers are quicker.

This is not a valid comparison if you are focused on stream behavior.

run your code with nvprof and you will see the timing difference.

Thanks, that makes sense. I ran nvprof and confirmed that the host-to-device transfers, not the kernel, consume most of the time.

//case1:paged memory
C:\Users\ss>nvprof d:\tmp\Release\test.exe
==11940== NVPROF is profiling process 11940, command: d:\tmp\Release\test.exe
Time taken: 74.3 ms
==11940== Profiling application: d:\tmp\Release\test.exe
==11940== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 54.53% 26.415ms 40 660.37us 615.49us 880.26us [CUDA memcpy HtoD]
41.74% 20.217ms 20 1.0109ms 895.33us 1.1776ms [CUDA memcpy DtoH]
3.73% 1.8081ms 20 90.403us 89.472us 91.584us kernel(int*, int*, int*)
API calls: 69.93% 181.89ms 2 90.945ms 975ns 181.89ms cudaEventCreate
28.50% 74.138ms 60 1.2356ms 762.39us 2.0736ms cudaMemcpy
0.48% 1.2476ms 3 415.86us 351.33us 467.38us cudaMalloc
0.46% 1.1986ms 94 12.750us 0ns 713.14us cuDeviceGetAttribute
0.22% 564.91us 1 564.91us 564.91us 564.91us cudaGetDeviceProperties
0.15% 383.03us 3 127.68us 110.69us 156.04us cudaFree
0.14% 369.13us 1 369.13us 369.13us 369.13us cuDeviceGetName
0.08% 216.26us 20 10.812us 7.0700us 34.865us cudaLaunch
0.01% 25.356us 2 12.678us 3.6570us 21.699us cudaEventRecord
0.01% 20.724us 1 20.724us 20.724us 20.724us cudaEventSynchronize
0.00% 12.434us 1 12.434us 12.434us 12.434us cudaGetDevice
0.00% 11.698us 60 194ns 0ns 975ns cudaSetupArgument
0.00% 11.216us 20 560ns 243ns 1.9510us cudaConfigureCall
0.00% 8.2900us 1 8.2900us 8.2900us 8.2900us cudaEventElapsedTime
0.00% 6.3390us 1 6.3390us 6.3390us 6.3390us cuDeviceTotalMem
0.00% 1.9510us 3 650ns 244ns 1.4630us cuDeviceGetCount
0.00% 976ns 2 488ns 244ns 732ns cuDeviceGet

//case 2: pinned memory
C:\Users\ss>nvprof d:\tmp\cuda\memcpy\pinnedmem\test.exe
==17072== NVPROF is profiling process 17072, command: d:\tmp\cuda\memcpy\pinnedmem\test.exe
Time taken: 22.0 ms
==17072== Profiling application: d:\tmp\cuda\memcpy\pinnedmem\test.exe
==17072== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 61.91% 13.354ms 40 333.85us 332.39us 345.89us [CUDA memcpy HtoD]
29.69% 6.4038ms 20 320.19us 315.68us 333.86us [CUDA memcpy DtoH]
8.40% 1.8119ms 20 90.593us 89.536us 97.792us kernel(int*, int*, int*)
API calls: 51.43% 95.007ms 2 47.503ms 1.2190us 95.005ms cudaEventCreate
24.31% 44.913ms 3 14.971ms 14.888ms 15.026ms cudaHostAlloc
11.46% 21.169ms 1 21.169ms 21.169ms 21.169ms cudaStreamSynchronize
10.90% 20.144ms 3 6.7146ms 6.3983ms 7.1036ms cudaFreeHost
0.53% 970.60us 3 323.53us 315.49us 333.29us cudaMalloc
0.34% 622.93us 3 207.64us 186.76us 236.50us cudaFree
0.33% 612.94us 60 10.215us 2.6820us 34.133us cudaMemcpyAsync
0.21% 379.37us 20 18.968us 15.848us 59.246us cudaLaunch
0.20% 370.59us 94 3.9420us 0ns 162.38us cuDeviceGetAttribute
0.19% 341.82us 1 341.82us 341.82us 341.82us cudaGetDeviceProperties
0.05% 87.040us 1 87.040us 87.040us 87.040us cuDeviceGetName
0.01% 25.843us 1 25.843us 25.843us 25.843us cudaEventSynchronize
0.01% 24.137us 2 12.068us 2.9260us 21.211us cudaEventRecord
0.01% 19.992us 1 19.992us 19.992us 19.992us cudaStreamDestroy
0.01% 17.311us 1 17.311us 17.311us 17.311us cudaStreamCreate
0.01% 10.240us 1 10.240us 10.240us 10.240us cudaGetDevice
0.01% 9.2650us 60 154ns 0ns 975ns cudaSetupArgument
0.00% 8.5340us 1 8.5340us 8.5340us 8.5340us cudaEventElapsedTime
0.00% 5.8510us 20 292ns 0ns 1.4620us cudaConfigureCall
0.00% 5.3630us 1 5.3630us 5.3630us 5.3630us cuDeviceTotalMem
0.00% 1.4630us 3 487ns 0ns 975ns cuDeviceGetCount
0.00% 1.2180us 2 609ns 243ns 975ns cuDeviceGet

//case 3: unified memory
C:\Users\ss>nvprof d:\tmp\cuda\memcpy\unifiedmem\Release\test.exe
==2768== NVPROF is profiling process 2768, command: d:\tmp\cuda\memcpy\unifiedmem\Release\test.exe
Time taken: 2.0 ms
==2768== Profiling application: d:\tmp\cuda\memcpy\unifiedmem\Release\test.exe
==2768== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 1.9276ms 20 96.378us 90.144us 146.69us kernel(int*, int*, int*)
API calls: 72.16% 448.95ms 20 22.448ms 3.4130us 448.85ms cudaLaunch
15.64% 97.289ms 2 48.645ms 1.2190us 97.288ms cudaEventCreate
9.12% 56.713ms 3 18.904ms 11.421ms 32.832ms cudaMallocManaged
2.61% 16.233ms 3 5.4110ms 4.7445ms 5.9716ms cudaFree
0.33% 2.0334ms 1 2.0334ms 2.0334ms 2.0334ms cudaDeviceSynchronize
0.06% 370.11us 94 3.9370us 0ns 162.62us cuDeviceGetAttribute
0.05% 338.41us 1 338.41us 338.41us 338.41us cudaGetDeviceProperties
0.01% 88.991us 1 88.991us 88.991us 88.991us cuDeviceGetName
0.01% 46.812us 1 46.812us 46.812us 46.812us cudaEventSynchronize
0.00% 25.601us 2 12.800us 4.1450us 21.456us cudaEventRecord
0.00% 10.485us 60 174ns 0ns 975ns cudaSetupArgument
0.00% 10.484us 1 10.484us 10.484us 10.484us cudaGetDevice
0.00% 8.2890us 1 8.2890us 8.2890us 8.2890us cudaEventElapsedTime
0.00% 5.8510us 1 5.8510us 5.8510us 5.8510us cuDeviceTotalMem
0.00% 5.6090us 20 280ns 0ns 1.7060us cudaConfigureCall
0.00% 1.4630us 3 487ns 244ns 975ns cuDeviceGetCount
0.00% 975ns 2 487ns 244ns 731ns cuDeviceGet

==2768== Unified Memory profiling result:
Device “GeForce GTX 1060 6GB (0)”
Count Avg Size Min Size Max Size Total Size Total Time Name
40960 4.0000KB 4.0000KB 4.0000KB 160.0000MB 213.7575ms Host To Device
5120 32.000KB 32.000KB 32.000KB 160.0000MB 49.41209ms Device To Host

C:\Users\ss>