Hi,
I’m currently doing a project with CUDA where a pipeline is refreshed with 200-10000 new events every 1ms. Each time, I want to call one (or two) kernels which compute a small list of outputs, then feed those outputs to the next element of the pipeline.
The theoretical flow is (a rough code sketch follows the list):
- receive data in an std::vector
- cudaMemcpy the vector to GPU
- processing
- generate small list of outputs
- cudaMemcpy to the output std::vector (this is where I’m calling cudaDeviceSynchronize for testing purposes in the meantime)
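In code, one tick currently looks roughly like the sketch below. Output, process_events, and the launch configuration are placeholders rather than my real names, and the device buffers are preallocated once at startup:

#include <vector>
#include <cuda_runtime.h>

// Rough sketch of one 1ms tick (placeholder names, arbitrary launch config).
struct Event  { unsigned short x, y; short a; long long b; };
struct Output { float value; };   // stand-in for the real output type

__global__ void process_events(const Event *ev, int n, Output *out) {
    // stand-in for the real processing kernel(s)
}

// d_events/d_outputs are device buffers allocated once with cudaMalloc;
// out is presized to the (small) expected number of outputs.
void process_tick(const std::vector<Event> &in, std::vector<Output> &out,
                  Event *d_events, Output *d_outputs) {
    cudaMemcpy(d_events, in.data(), in.size() * sizeof(Event),
               cudaMemcpyHostToDevice);               // upload fresh events
    process_events<<<1, 256>>>(d_events, (int)in.size(), d_outputs);
    cudaDeviceSynchronize();     // <- the call discussed below (500us-900us)
    cudaMemcpy(out.data(), d_outputs, out.size() * sizeof(Output),
               cudaMemcpyDeviceToHost);               // download the results
}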
But when I call cudaDeviceSynchronize on a 1-block/1-thread empty kernel with no processing, it already takes 500us to 900us on average, which is almost my entire 1ms timeframe.
I could change the pipeline’s timeframe so that it receives events every 5ms, with 5x more events each time, but that wouldn’t be ideal.
What would be the best way to minimize the overhead of calling cudaDeviceSynchronize? Could streams be helpful in this situation, or is there another solution to run the pipeline efficiently?
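For reference, the kind of stream-based variant I have in mind is sketched below, reusing the placeholder names from the sketch above. It is untested: the idea would be to queue the copies and the kernel on a non-default stream, then busy-poll an event instead of blocking in cudaDeviceSynchronize (the host buffers would have to be allocated with cudaHostAlloc for the async copies to really be asynchronous):

// stream and done are created once at startup:
//   cudaStreamCreate(&stream);
//   cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
void process_tick_async(const Event *h_events, int n,
                        Output *h_outputs, int m,
                        Event *d_events, Output *d_outputs,
                        cudaStream_t stream, cudaEvent_t done) {
    cudaMemcpyAsync(d_events, h_events, n * sizeof(Event),
                    cudaMemcpyHostToDevice, stream);
    process_events<<<1, 256, 0, stream>>>(d_events, n, d_outputs);
    cudaMemcpyAsync(h_outputs, d_outputs, m * sizeof(Output),
                    cudaMemcpyDeviceToHost, stream);
    cudaEventRecord(done, stream);
    // cudaEventQuery returns cudaErrorNotReady until everything queued
    // before the event has completed; spinning avoids the OS-level wait
    // that cudaDeviceSynchronize seems to incur.
    while (cudaEventQuery(done) == cudaErrorNotReady) { /* spin */ }
}

(I haven’t measured whether the spin actually avoids the overhead; it’s just the direction I’m considering.)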
I’m using a Jetson TK1 with Ubuntu 14.04, and I’m accessing the board via SSH.
Thanks for any tips and help,
Here’s an nvprof log of the application. The Avg for cudaDeviceSynchronize is around 600us across runs, ranging from 500us to 900us:
==20839== Profiling application: python pthing.py input.rec
==20839== Profiling result:
Time(%) Time Calls Avg Min Max Name
77.62% 13.273ms 1325 10.017us 9.6660us 10.667us empty_kernel(void)
22.38% 3.8273ms 8 478.42us 150.33us 615.83us [CUDA memset]
==20839== API calls:
Time(%) Time Calls Avg Min Max Name
53.29% 800.01ms 1325 603.78us 15.000us 5.3809ms cudaDeviceSynchronize (<<<1,1>>>)
29.58% 444.13ms 1325 335.19us 91.750us 2.6252ms cudaLaunch
16.13% 242.08ms 9 26.898ms 39.083us 239.06ms cudaMalloc
0.83% 12.501ms 1325 9.4350us 2.4160us 681.67us cudaConfigureCall
0.15% 2.2661ms 8 283.26us 175.00us 340.42us cudaMemset
0.01% 207.75us 83 2.5030us 666ns 78.167us cuDeviceGetAttribute
0.00% 6.4180us 2 3.2090us 1.0840us 5.3340us cuDeviceGetCount
0.00% 2.6660us 1 2.6660us 2.6660us 2.6660us cuDeviceTotalMem
0.00% 2.0000us 2 1.0000us 1.0000us 1.0000us cuDeviceGet
0.00% 1.9170us 1 1.9170us 1.9170us 1.9170us cuDeviceGetName
Below is a small standalone reproduction of the program. For some reason, the average cudaDeviceSynchronize time there is about 3 times lower, but it’s still really high for an empty 1-thread kernel: with this source code, the TK1 takes around 250us per sync, the TX1 around 100us, and my laptop around 4us.
/* Compiled with `nvcc -I/usr/local/cuda/samples/common/inc test.cu -o t`
* Profiled with `nvprof --profile-from-start off ./t`
**/
#include <iostream>
#include <cuda.h>
#include <helper_cuda.h>
#include <cuda_profiler_api.h>
#define MAX_INPUT_BUFFER_SIZE 131072
typedef struct {
    unsigned short x;
    unsigned short y;
    short a;
    long long b;
} Event;
long long *d_a_[2], *d_b_[2];
float *d_as_, *d_bs_;
bool *d_some_bool_[2];
Event *d_data_;
int width_ = 320;
int height_ = 240;
__global__ void reset_timesurface(long long ts,
                                  long long *d_a_0, long long *d_a_1,
                                  long long *d_b_0, long long *d_b_1,
                                  float *d_as, float *d_bs,
                                  bool *d_some_bool_0, bool *d_some_bool_1,
                                  Event *d_data) {
    // deliberately empty: we only measure launch + synchronize overhead
}
void reset_errors(long long ts) {
    static const int n = 1024;
    static const dim3 grid_size(width_ * height_ / n
                                + (width_ * height_ % n != 0), 1, 1);
    static const dim3 block_dim(n, 1, 1);
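    // grid_size/block_dim mirror the real launch configuration but are unused
    // here: the launch below is deliberately <<<1, 1>>>, matching the
    // 1-block/1-thread empty-kernel measurement described above.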
    reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
                                d_b_[0], d_b_[1],
                                d_as_, d_bs_,
                                d_some_bool_[0], d_some_bool_[1], d_data_);
    cudaDeviceSynchronize();
    // static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
    // cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
}
int main(void) {
    checkCudaErrors(cudaMalloc(&(d_a_[0]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_a_[0], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_a_[1]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_a_[1], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_b_[0]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_b_[0], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_b_[1]), sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_b_[1], 0, sizeof(long long)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_as_, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_as_, 0, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_bs_, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_bs_, 0, sizeof(float)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_some_bool_[0]), sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_some_bool_[0], 0, sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&(d_some_bool_[1]), sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMemset(d_some_bool_[1], 0, sizeof(bool)*width_*height_*2));
    checkCudaErrors(cudaMalloc(&d_data_, sizeof(Event)*MAX_INPUT_BUFFER_SIZE));
    reset_errors(16487L); // warmup
    cudaProfilerStart();
    for (int i = 0; i < 5005; ++i)
        reset_errors(16487L);
    cudaProfilerStop();
    cudaFree(d_a_[0]);
    cudaFree(d_a_[1]);
    cudaFree(d_b_[0]);
    cudaFree(d_b_[1]);
    cudaFree(d_as_);
    cudaFree(d_bs_);
    cudaFree(d_some_bool_[0]);
    cudaFree(d_some_bool_[1]);
    cudaFree(d_data_);
    cudaDeviceReset();
}
And the output from this test program, launched on my Jetson TK1:
The Avg for cudaDeviceSynchronize is around 240us, ranging from 150us to 400us:
==20355== NVPROF is profiling process 20355, command: ./t
==20355== Profiling application: ./t
==20355== Profiling result:
Time(%) Time Calls Avg Min Max Name
100.00% 46.497ms 5005 9.2900us 6.4160us 13.167us reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
==20355== API calls:
Time(%) Time Calls Avg Min Max Name
68.09% 1.75938s 5005 351.52us 9.2500us 11.732ms cudaDeviceSynchronize
25.41% 656.50ms 5005 131.17us 45.916us 8.1398ms cudaLaunch
5.82% 150.45ms 50050 3.0050us 833ns 7.3719ms cudaSetupArgument
0.68% 17.661ms 5005 3.5280us 1.1660us 980.75us cudaConfigureCall