CUDA performance randomly improves in a quantized manner

I’m experiencing a problem where CUDA performance seems to be artificially limited and then randomly improves at runtime in a quantized (stepwise) manner. I see this behavior on the TX2, but not on the TX1. My question is how I can always get maximal performance instead.

For demonstration I wrote a small test application that renders the Mandelbrot set over and over again. This is an example of the output it creates:

1s, 33.5 fps (kernel: 0.018ms + sync: 24.822ms = 24.841ms)
2s, 40.2 fps (kernel: 0.018ms + sync: 24.817ms = 24.835ms)
3s, 40.2 fps (kernel: 0.018ms + sync: 24.808ms = 24.827ms)
4s, 40.2 fps (kernel: 0.018ms + sync: 24.827ms = 24.846ms)
5s, 40.2 fps (kernel: 0.018ms + sync: 24.825ms = 24.844ms)
6s, 47.6 fps (kernel: 0.019ms + sync: 18.750ms = 18.769ms)
7s, 53.3 fps (kernel: 0.018ms + sync: 18.742ms = 18.760ms)
8s, 53.3 fps (kernel: 0.018ms + sync: 18.751ms = 18.769ms)
9s, 53.3 fps (kernel: 0.019ms + sync: 18.744ms = 18.762ms)
10s, 53.3 fps (kernel: 0.018ms + sync: 18.753ms = 18.771ms)
11s, 53.3 fps (kernel: 0.018ms + sync: 18.749ms = 18.767ms)

You can see that in this case the performance suddenly improves after six seconds. But this behavior is random: each time I run the application, the performance levels and the points in time at which they change are different.

This is the complete source code of the application:

#include <iostream>
#include <chrono>
#include <cstdint>
#include <cuda_runtime.h>

//#define SAVE_TO_DISK
#ifdef SAVE_TO_DISK
#include <cairo.h>
#endif

using FloatType = float;
constexpr int MB_WIDTH = 2048;
constexpr int MB_HEIGHT = MB_WIDTH;
constexpr FloatType MB_ASPECT = (FloatType)MB_WIDTH/MB_HEIGHT;
constexpr int MB_BLOCK_WIDTH = 32;
constexpr int MB_BLOCK_HEIGHT = 32;
constexpr int MAX_ITER = 256;

__global__ void mandelbrotK(uint32_t *pixelsOut, FloatType tx, FloatType ty, FloatType s) {
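	// One thread per pixel: escape-time iteration on c = (cRe, cIm), written out as a greyscale ARGB pixel.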
	const int x = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	const FloatType cRe = s*((FloatType)x/MB_WIDTH-FloatType(0.5)) + tx;
	const FloatType cIm = s/MB_ASPECT*((FloatType)y/MB_HEIGHT-FloatType(0.5)) + ty;
	FloatType zRe = 0;
	FloatType zIm = 0;
	int it;
	for (it = 0; it < MAX_ITER && zRe*zRe+zIm*zIm < 2*2; ++it) {
		const FloatType temp = zRe*zRe - zIm*zIm + cRe;
		zIm = 2*zRe*zIm + cIm;
		zRe = temp;
	}

	const int r = it==MAX_ITER?0:it;
	const int g = r;
	const int b = r;
	pixelsOut[x + y*MB_WIDTH] = b | (g << 8) | (r << 16) | 0xff000000;
}

int main(int argc, char *argv[]) {
	using namespace std;
	using namespace chrono;
	using theclock = high_resolution_clock;

	theclock::time_point lastTime;
	theclock::time_point startTime;
	int lastFrameCount = 0;

	uint32_t *mbPixels;
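	// Pinned (page-locked) host memory for the 32-bit ARGB frame buffer (4 bytes per pixel).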
	cudaMallocHost(&mbPixels, MB_WIDTH*MB_HEIGHT*4);
	while (true) {
		const auto tStartRender = theclock::now();

		mandelbrotK<<<dim3(MB_WIDTH/MB_BLOCK_WIDTH, MB_HEIGHT/MB_BLOCK_HEIGHT), dim3(MB_BLOCK_WIDTH, MB_BLOCK_HEIGHT)>>>(
			mbPixels, -0.5, 0, 2
		);
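		// The kernel launch above returns immediately; the GPU work is only waited for in
		// cudaDeviceSynchronize() below, so it is accounted to "sync" rather than "kernel".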

		const auto tStartSync = theclock::now();

		cudaDeviceSynchronize();

		const auto tDone = theclock::now();

	#ifdef SAVE_TO_DISK
		cairo_surface_t *surface = cairo_image_surface_create_for_data((unsigned char*)mbPixels, CAIRO_FORMAT_ARGB32, MB_WIDTH, MB_HEIGHT, MB_WIDTH*4);
		cairo_surface_write_to_png(surface, "/home/nvidia/Documents/mandelbrot.png");
		cairo_surface_destroy(surface);
		return 0;
	#endif

		theclock::time_point curTime = theclock::now();
		if (!lastTime.time_since_epoch().count()) {
			lastTime = curTime;
			startTime = curTime;
		} else {
			++lastFrameCount;
			const double timeDiffLast = duration_cast<duration<double>>(curTime-lastTime).count();
			if (timeDiffLast > 1) {
				const double timeDiffStart = duration_cast<duration<double>>(curTime-startTime).count();

				const double kernelDuration = duration_cast<duration<double, ratio<1, 1000>>>(tStartSync-tStartRender).count();
				const double syncDuration = duration_cast<duration<double, ratio<1, 1000>>>(tDone-tStartSync).count();
				const double renderDuration = duration_cast<duration<double, ratio<1, 1000>>>(tDone-tStartRender).count();

				printf("%.0fs, %.1f fps (kernel: %.3fms + sync: %.3fms = %.3fms)",
						timeDiffStart,
						lastFrameCount/timeDiffLast,
						kernelDuration,
						syncDuration,
						renderDuration);
				cout << endl;
				lastTime = curTime;
				lastFrameCount = 0;
			}
		}
	}
	return 0;
}

Hi,

To maximize performance on the TX2, please:

  1. Set nvpmodel to MAX-N
  2. Run jetson_clocks.sh (concrete commands sketched below)
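
In concrete terms this would be roughly the following; this is a sketch that assumes MAX-N is power model 0 on the TX2 and that jetson_clocks.sh sits in the home directory of a stock L4T install, both of which can differ between releases:

sudo nvpmodel -m 0               # select the MAX-N power model
sudo ~/jetson_clocks.sh          # pin CPU/GPU/EMC clocks to their maximum frequencies
sudo ~/jetson_clocks.sh --show   # optionally, print the clock settings now in effect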

Hi, your solution works perfectly!
But just out of curiosity: with non-identical min and max frequencies set (i.e. without executing jetson_clocks.sh), how does the system decide which exact frequency to use? If I set, for example, "nvpmodel -m 0", which has the highest maximum GPU frequency, the actual frequency chosen is low and my test application runs very slowly (6.7 fps, 150 ms rendering time per frame). When does the system choose the maximum frequency?