I want to do some image processing on my Jetson TX1, but I'm struggling with a big bottleneck when launching my kernel. Even with an empty kernel (see code below) I get an execution time of 10 ms. Why?
/**
 * Deliberately empty kernel used to measure bare launch overhead.
 *
 * None of the parameters are read or written; the body performs no device
 * work, so any time attributed to this kernel is launch/scheduling cost
 * rather than computation.
 *
 * @param in    unused (presumably the source image buffer — confirm with caller)
 * @param out   unused (presumably the destination image buffer — confirm with caller)
 * @param imgw  unused (presumably the image width — confirm with caller)
 * @param imgh  unused (presumably the image height — confirm with caller)
 */
__global__ void just_copy(uint8_t *in, uint8_t *out, uint32_t imgw, uint32_t imgh)
{
    // Intentionally no work: every thread exits immediately.
    return;
}
// Time a single launch of just_copy with CUDA events.
//
// Changes relative to a naive "record / launch / record" measurement:
//   * A warm-up launch is issued and fully synchronized BEFORE the timed
//     region. The very first launch in a process pays one-time costs
//     (context creation, module loading) that would otherwise be counted
//     as kernel time.
//   * Every runtime call is checked, and cudaGetLastError() is queried after
//     each launch, because a kernel launch itself returns no error code.
//   * Launch geometry is set up outside the timed region.

// Local helper: report (but do not abort on) any failing CUDA runtime call.
#define JUST_COPY_TIMING_CHECK(call)                                          \
    do {                                                                      \
        cudaError_t timing_err_ = (call);                                     \
        if (timing_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(timing_err_));                         \
        }                                                                     \
    } while (0)

float elapsed=0;
cudaEvent_t start, stop;

// 16x16 threads per block, 240x135 blocks => 3840x2160 threads in total.
// NOTE(review): presumably one thread per pixel of a 3840x2160 image —
// confirm against width/height.
dim3 threads(16, 16);
dim3 grid(240,135);

JUST_COPY_TIMING_CHECK(cudaEventCreate(&start));
JUST_COPY_TIMING_CHECK(cudaEventCreate(&stop));

// Warm-up: absorb one-time initialization so it is not part of the timing.
just_copy<<<grid, threads>>>(d_src, d_dst, width, height);
JUST_COPY_TIMING_CHECK(cudaGetLastError());
JUST_COPY_TIMING_CHECK(cudaDeviceSynchronize());

// Timed region: both events and the launch go to the default stream (0).
JUST_COPY_TIMING_CHECK(cudaEventRecord(start, 0));
just_copy<<<grid, threads>>>(d_src, d_dst, width, height);
JUST_COPY_TIMING_CHECK(cudaGetLastError());
JUST_COPY_TIMING_CHECK(cudaEventRecord(stop, 0));

// Block the host until the stop event (and therefore the kernel) completed.
JUST_COPY_TIMING_CHECK(cudaEventSynchronize (stop));
JUST_COPY_TIMING_CHECK(cudaEventElapsedTime(&elapsed, start, stop));

JUST_COPY_TIMING_CHECK(cudaEventDestroy(start));
JUST_COPY_TIMING_CHECK(cudaEventDestroy(stop));

printf("The elapsed time in gpu was %.2f ms\n", elapsed);

#undef JUST_COPY_TIMING_CHECK