GPU processing does not give full power after hibernation

The GPU is put into hibernation state after GPU processing is repeated at a predetermined interval.
After processing resumes, the GPU's processing speed drops significantly.

This phenomenon did not occur in previous CUDA versions.
I would like to know how to make the processing always fast even with newer CUDA versions.

The phenomenon can be generated by repeating the following simple process.

  1. Sleep for 5 milliseconds every cycle.
  2. Perform appropriate GPU processing.
  3. Perform a 5-second sleep on every 1000th loop.
    *Repeat the above process.

“CUDA 7.0” and “CUDA 11.8” processing times were compared.
The graph is shown below.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <Windows.h>
#include <thread>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, double &processTime);

// Element-wise integer add, repeated to act as a small GPU workload.
//
// Launch layout: indexing uses threadIdx.x only, so the kernel expects a single
// 1-D block with one thread per element (the host code in this file launches
// it as <<<1, size>>>). No shared memory is used.
// Preconditions: c, a and b must be device pointers to at least blockDim.x ints.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Each thread owns exactly one element of the vectors.
    const int i = threadIdx.x;

    // The repeat counter must not be named `i`: shadowing the thread index made
    // every thread touch elements 0..9999, far past the end of the buffers, and
    // made all threads race on the same output elements.
    // NOTE(review): the iterations are redundant, so the compiler may collapse
    // them into a single store; use a data-dependent loop if a longer-running
    // kernel is required for the measurement.
    for (int iter = 0; iter < 10000; iter++) {
        c[i] = a[i] + b[i];
    }
}

// Reproduction driver: runs addWithCuda every ~5 ms, pauses for 5 s after every
// 1000th iteration, and prints the time each call took. The loop only ends when
// a CUDA call fails.
//
// Returns 0 only if the loop ended without error and the device reset
// succeeded (not reachable in practice, since the loop is infinite); returns 1
// on any CUDA failure.
int main()
{
	{
		// Warm up: run once before measuring so the timed iterations do not
		// include the first call made by this process.
		const int arraySize = 5;
		const int a[arraySize] = { 1, 2, 3, 4, 5 };
		const int b[arraySize] = { 10, 20, 30, 40, 50 };
		int c[arraySize] = { 0 };
		double warmupTime = 0;
		cudaError_t warmupStatus = addWithCuda(c, a, b, arraySize, warmupTime);
		if (warmupStatus != cudaSuccess) {
			// Previously this result was stored in a shadowing local and ignored.
			fprintf(stderr, "addWithCuda failed!\n");
			return 1;
		}
	}

	double processTime = 0;
	int index = 0;
	int exitCode = 0;

	// Request 1 ms timer resolution so Sleep(5) is close to 5 ms. This is done
	// once and paired with timeEndPeriod below, instead of being requested again
	// on every iteration without ever being released.
	timeBeginPeriod(1);

	while (true) {

		Sleep(5);

		const int arraySize = 5;
		const int a[arraySize] = { 1, 2, 3, 4, 5 };
		const int b[arraySize] = { 10, 20, 30, 40, 50 };
		int c[arraySize] = { 0 };

		cudaError_t loopStatus = addWithCuda(c, a, b, arraySize, processTime);
		if (loopStatus != cudaSuccess) {
			fprintf(stderr, "addWithCuda failed!\n");
			// Leave the loop rather than returning so the cleanup below runs.
			exitCode = 1;
			break;
		}

		printf("loop count : %d / processTime : %lf \n", index, processTime);

		if (index == 999) {
			// Long idle period after every 1000th iteration; the slowdown under
			// investigation shows up in the iterations that follow it.
			index = 0;
			Sleep(5000);
		}
		else {
			index++;
		}
	}

	timeEndPeriod(1);

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Nsight and Visual Profiler to show complete traces.
	cudaError_t cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!\n");
		return 1;
	}

	return exitCode;
}

// Helper function for using CUDA to add vectors in parallel, timing the whole
// round trip.
//
// Parameters:
//   c           - host buffer receiving `size` results.
//   a, b        - host input buffers of `size` ints each.
//   size        - number of elements; also used as the thread count of the
//                 single block that is launched.
//   processTime - set to the wall-clock time of this call in milliseconds,
//                 covering device selection, allocation, copies, the kernel,
//                 and the frees. It is written on failure paths too.
//
// Returns cudaSuccess, or the error of the first CUDA call that failed.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, double &processTime)
{
	// Start the timer before any CUDA call so all per-call overhead is included.
	LARGE_INTEGER m_liFreq;
	LARGE_INTEGER m_start;
	QueryPerformanceFrequency(&m_liFreq);
	QueryPerformanceCounter(&m_start);

    // Initialised to null so the cleanup code can free them unconditionally.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed: %s. Do you have a CUDA-capable GPU installed?\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (dev_c) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (dev_a) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (dev_b) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (a -> dev_a) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (b -> dev_b) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed after launching addKernel: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (dev_c -> c) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // Reached on success as well as on failure. The cudaFree results are
    // deliberately not stored so cudaStatus keeps reporting the first error.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

	// Stop the timer after the frees and convert counter ticks to milliseconds.
	LARGE_INTEGER m_end;
	QueryPerformanceCounter(&m_end);
	processTime = 1000.0 * (double)(m_end.QuadPart - m_start.QuadPart) / m_liFreq.QuadPart;

    return cudaStatus;
}


It has already been reported a few times on these forums that putting the GPU to sleep will affect CUDA processing speed immediately after the wakeup. That is known behavior. If you’d like attention on your specific case, I’d suggest filing a bug. I’m not saying it’s a defect or that anything can be done about it.

1 Like

Thank you.
This is the information I wanted to know.

I was under the impression that this is a spec behavior.

It seems to be fine during the first few calculations just after the process starts.

I may be able to control the CUDA runtime planning by devising a way to call the kernel.

I have a few ideas and will try them out.
Thank you so much for your help.