I have been trying to determine where the bottleneck in my program is and have discovered that in windows (I havnt tested in linux yet) a kernel launch followed by a synchronize call is extremely expensive : Approximately 15x more expensive than two regular kernel launches (non-synced).
Is this true? Have I made a mistake? If so how can I fix it?
Here are my results:
NO CALLS :
0.003 us
streamSync :
3.52 us
deviceSync :
3.48 us
empty<<<1,64,0,stream>>> + streamSync :
118.53 us
empty<<<1,64>>> + deviceSync :
119.46 us
Here is my code (run in release under cuda 4.2, driver 302.49, windows server 2008, gtx 580)
#include "cuda_runtime.h"
#include <stdio.h>
#ifdef _WIN32 || _WIN64
#include <windows.h>
class Clock
{
public:
Clock() {QueryPerformanceFrequency(&_freq);}
inline void tic() {QueryPerformanceCounter(&_perf0);}
inline double toc() {QueryPerformanceCounter(&_perf1); return ((double)(_perf1.QuadPart-_perf0.QuadPart))/(double)_freq.QuadPart;}
private:
LARGE_INTEGER _freq,_perf0,_perf1;
};
#elif defined __linux__ || defined TARGET_OS_MAC
#include <time.h>
class Clock
{
public:
Clock() {_freq = 1000*1000*1000;}
inline void tic() {clock_gettime(CLOCK_MONOTONIC,&_ts0);}
inline double toc() {clock_gettime(CLOCK_MONOTONIC,&_ts1); return (double)(nanos(_freq, _ts1) - nanos(_freq,_ts0))/(double)_freq;}
private:
inline long long nanos(long long f,timespec t) {return f*t.tv_sec + t.tv_nsec; }
timespec _ts0,_ts1;
long long _freq;
};
#else #error "Platform not supported."
#endif
__global__ void emptyKernel() { }
int main()
{
int N= 10000;
volatile double t;
Clock clk;
cudaSetDevice(0);
//Create a stream so we can test the difference between default stream and non-default
cudaStream_t stream;
cudaStreamCreate( &stream);
//-----------------NO CALLS-----------------------
clk.tic();
for (volatile int n = 0; n < N; n++){
}
t = clk.toc() *1000*1000;
printf("\nNO CALLS :\n%.3f us\n",t/N);
//-----------------streamSync------------------
clk.tic();
for (volatile int n = 0; n < N; n++){
cudaStreamSynchronize(stream);
}
t = clk.toc() *1000*1000;
printf("\nstreamSync :\n%.2f us\n",t/N);
//-----------------deviceSync-----------------------
clk.tic();
for (volatile int n = 0; n < N; n++){
cudaDeviceSynchronize();
}
t = clk.toc() *1000*1000;
printf("\ndeviceSync :\n%.2f us\n",t/N);
//-----------------empty<<<1,64,0,stream>>> + streamSync----------------------
clk.tic();
for (volatile int n = 0; n < N; n++){
emptyKernel<<<1,64,0,stream>>>();
cudaStreamSynchronize(stream);
}
t = clk.toc() *1000*1000;
printf("\nempty<<<1,64,0,stream>>> + streamSync :\n%.2f us\n",t/N);
//-----------------empty<<<1,64>>> + deviceSync----------------------
clk.tic();
for (volatile int n = 0; n < N; n++){
emptyKernel<<<1,64>>>();
cudaDeviceSynchronize();
}
t = clk.toc() *1000*1000;
printf("\nempty<<<1,64>>> + deviceSync :\n%.2f us\n",t/N);
cudaDeviceReset();
getchar();
return 0;
}