Hi everyone!
I am evaluating CUDA performance on the Jetson TX1.
The goal of the project is to repeat operations within a very short time (less than 4 us).
We measured the launch time of an empty kernel function (with the Ubuntu GUI stopped).
Ave: 0.658500 ms
This time seems to be longer than I expected.
Reference site:
https://www.cs.virginia.edu/~mwb7w/cuda_support/kernel_overhead.html
Does my board have a problem?
Has anyone ever measured it?
The test code is attached below.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
//#include <cutil.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_timer.h>
// Single stopwatch handle shared by every timing helper in this file.
// It starts out NULL; main() creates it with sdkCreateTimer() and releases it
// with sdkDeleteTimer().
StopWatchInterface *timer = NULL;
/*
 * Begin a fresh measurement on the shared stopwatch.
 *
 * The stopwatch is reset first and then started, in that order, so each
 * measurement is taken from a cleared timer.
 */
void startTimer()
{
    sdkResetTimer(&timer);   /* clear the shared stopwatch */
    sdkStartTimer(&timer);   /* start the new measurement */
}
/*
 * Stop the shared stopwatch and print its value together with a label.
 *
 * str: text printed after the measured value.
 *
 * No device synchronization is performed here, so work that was launched
 * asynchronously before this call may still be running when the stopwatch
 * is stopped.
 */
void endTimer(const char*str)
{
    sdkStopTimer(&timer);

    /* Read the stopwatch and report it in one step. */
    printf("[%f] - %s\n", sdkGetTimerValue(&timer), str);
}
/*
 * Wait for the device to finish all outstanding work, stop the shared
 * stopwatch and return its value.
 *
 * Returns: the value reported by sdkGetTimerValue() for the shared stopwatch
 *          (presumably milliseconds, as with the CUDA samples timer - confirm
 *          against helper_timer.h).
 *
 * A failed synchronization is reported on stderr; the stopwatch value is
 * still returned so existing callers keep working, but it should not be
 * trusted in that case.
 */
float getEndTimer(void)
{
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
       documented replacement and has the same blocking behaviour. */
    cudaError_t sync_status = cudaDeviceSynchronize();
    if (sync_status != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n",
                cudaGetErrorString(sync_status));
    }

    sdkStopTimer(&timer);
    return sdkGetTimerValue(&timer);
}
// Empty kernel: it performs no work and ignores its three arguments.
// main() launches it as the target whose launch/completion time is measured.
__global__ void kernel(int x, int y, int z)
{
}
/*
 * Benchmark driver: times repeated launches of the empty kernel.
 *
 * One warm-up launch is issued and reported as "first call"; then `count`
 * launches are timed individually (launch + wait for completion) and the
 * max / min / average are printed.
 *
 * Note: every launch uses 750 blocks of 1024 threads and each sample ends
 * with a device synchronization, so a sample covers scheduling and draining
 * that whole grid, not only the bare launch call.
 *
 * Returns 0 on success, EXIT_FAILURE if a kernel launch or synchronization
 * reports a CUDA error.
 */
int main(int argc, char** argv) {
    float min = 0.0f, max = 0.0f;   /* defined even before the first sample */
    float sum = 0;
    float elapsed_time;
    int count = 10;
    cudaError_t err;

    sdkCreateTimer(&timer);

    /* Warm-up launch; endTimer() does not synchronize, so this reports only
       the host-side time of the first launch call. */
    startTimer();
    kernel<<<750,1024>>>(1,2,3);
    endTimer("first call");

    /* Check the launch and drain the device before the timed loop. Without
       this wait, the warm-up kernel could still be running when the first
       sample starts and would be counted in that sample. */
    err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "first kernel launch failed: %s\n", cudaGetErrorString(err));
        sdkDeleteTimer(&timer);
        return EXIT_FAILURE;
    }

    for(int i = 0; i < count; i++)
    {
        startTimer();
        kernel<<<750,1024>>>(1,2,3);
        elapsed_time = getEndTimer();

        /* Checked after the stopwatch has stopped so the check itself is not
           part of the measurement. */
        err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            fprintf(stderr, "kernel launch %d failed: %s\n", i, cudaGetErrorString(err));
            sdkDeleteTimer(&timer);
            return EXIT_FAILURE;
        }

        sum += elapsed_time;
        if(i == 0)
            min = max = elapsed_time;
        else if(elapsed_time > max)
            max = elapsed_time;
        else if(elapsed_time < min)
            min = elapsed_time;
        printf("%f\n", elapsed_time);
    }

    printf("count:%d max:%f min:%f ave:%f\n", count, max, min, sum/count);
    sdkDeleteTimer(&timer);
    return 0;
}