CPU usage at 99% while kernel is running

Hi,

My code basically loops through <allocate GPU buffer, copy from host to device, execute kernel, copy back, free buffer>. I've noticed that even with CUDA 2.0 I get 99% CPU consumption while my program is running, and most of that time is spent in kernel mode. I don't think this is caused by the loop itself, as the kernel is only called about once or twice per second.
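To make the structure concrete, each iteration looks roughly like this (sketch only; myKernel, the launch configuration, and the buffer size are stand-ins for my real code):

#include <stdlib.h>
#include <cuda_runtime.h>

// Placeholder kernel; the real one does the actual work.
__global__ void myKernel(float *buf, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] *= 2.0f;
}

int main(void)
{
  const int n = 1 << 20;
  float *h_buf = (float*)malloc(n * sizeof(float));
  for (int i = 0; i < n; ++i) h_buf[i] = (float)i;

  for (int iter = 0; iter < 100; ++iter)   // in the real program this runs about once or twice per second
  {
    float *d_buf = 0;
    cudaMalloc((void**)&d_buf, n * sizeof(float));
    cudaMemcpy(d_buf, h_buf, n * sizeof(float), cudaMemcpyHostToDevice);
    myKernel<<<(n + 255) / 256, 256>>>(d_buf, n);
    // The device->host copy implicitly waits for the kernel to finish;
    // this is where the CPU sits at 99%, mostly in kernel mode.
    cudaMemcpy(h_buf, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_buf);
  }

  free(h_buf);
  return 0;
}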

Can anyone give me some insight into this? CUDA seems to burn CPU cycles in kernel mode while waiting for the GPU kernel to finish.

More precisely, it burns CPU cycles when a cudaThreadSynchronize() or other implicit sync (such as a device->host cudaMemcpy) occurs. This has been the case since CUDA 0.8.

From what I can tell, cudaThreadSynchronize() (and stream synchronization) polls the device continuously.

I've hacked around it in cases where I didn't need the host CPU or the thread to do anything but control CUDA: you can be nice to the CPU by using the event system.

Basically you construct a loop like while (cudaEventQuery(evt) == cudaErrorNotReady) { usleep(100); }

It's not beautiful, and it's not the best way to do it, but it's the only thing I've found short of directly interfacing with the driver's poll interface.

-Patrick

Thanks for the advice. Indeed, it seems that the cudaMemcpy() call after the kernel execution caused the driver to poll the device. I now do the following instead.
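Something along these lines (sketch only; myKernel, the launch configuration, d_out, h_out, and nBytes stand in for my actual kernel and buffers):

cudaEvent_t done;
cudaEventCreate(&done);

// Launch the kernel, then record an event right behind it on the same stream.
myKernel<<<grid, block>>>(d_out);
cudaEventRecord(done, 0);

// Poll the event gently instead of letting the driver spin at 100% CPU.
while (cudaEventQuery(done) == cudaErrorNotReady)
  usleep(100);

// By now the kernel has finished, so this copy returns without busy-waiting.
cudaMemcpy(h_out, d_out, nBytes, cudaMemcpyDeviceToHost);

cudaEventDestroy(done);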

The event recorded after the kernel launch only becomes ready once the kernel has finished. CPU consumption has now dropped from 99% to 5%.

This enables me to run a CPU-bound execution path in parallel with the GPU-bound one. Muhahahah

I have tried doing this with CUFFT but have had no luck. Does anyone have a snippet of code that demonstrates how to do the FFT asynchronously?

Thanks!

Here is the code I am running. You will see it doesn't give the correct results. What is wrong with it?

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cufft.h>
#include <cutil.h>

// includes, special
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// Complex data type
typedef float2 Complex;

int createArray(Complex *arrayPtr, int arrayLength);

int main(int argc, char** argv)
{
  int fftSize = 1*1024;
  int memSizeHost = 512*(1024*1024);
  Complex* hostMem = 0;
  Complex* deviceMem = 0;
  int batchSize = 1;

  CUT_DEVICE_INIT(1, "");

  // Allocate memory on GPU and RAM
  memSizeHost = (memSizeHost / (fftSize*sizeof(Complex))) * (fftSize*sizeof(Complex));
  int memSizeDevice = sizeof(Complex) * fftSize * batchSize;

  // Allocate memory in RAM
  CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, memSizeHost));
  CUDA_SAFE_CALL(cudaMalloc((void **) &deviceMem, memSizeDevice));   // Allocate array on device

  // Initialize GPU memory to nan and host memory to zero
  CUDA_SAFE_CALL(cudaMemset(deviceMem, 255, memSizeDevice));
  memset(hostMem, 0, memSizeHost);

  // Ensure memory allocation functions worked correctly.
  if(hostMem == NULL)
  {
    printf("Error -- Failed to allocate host memory!\n");
    exit(EXIT_FAILURE);
  }

  if(deviceMem == NULL)
  {
    printf("Error -- Failed to allocate device memory!\n");
    exit(EXIT_FAILURE);
  }

  // Create input data array.
  createArray(hostMem, memSizeHost/sizeof(Complex));

  // Create FFT plan
  cufftHandle plan;
  CUFFT_SAFE_CALL(cufftPlan1d(&plan, fftSize, CUFFT_C2C, batchSize));

  int ioi = 13;
  int idx = 0;

  if (0)
  {
    // Synchronous Processing
    CUDA_SAFE_CALL(cudaMemcpy(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice));
    CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD));

    // This modifies some of the host memory.  If we see it after copying back the data
    //   from the GPU to the host machine, the copy did not work.
    hostMem[ioi].x = -131313;

    CUDA_SAFE_CALL(cudaMemcpy(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyDeviceToHost));
  }
  else
  {
    // Asynchronous Processing
    cudaEvent_t event;
    cudaEventCreate(&event);
    cudaEventRecord(event, 0);

    cudaMemcpyAsync(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice, 0);
    cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD);

    // This modifies some of the host memory.  If we see it after copying back the data
    //   from the GPU to the host machine, the copy did not work.
    hostMem[ioi].x = -131313;

    cudaMemcpyAsync(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyHostToDevice, 0);
    cudaEventRecord(event, 0);

    //cudaThreadSynchronize();
    //cudaEventSynchronize(event);
    while (cudaEventQuery(event) == cudaErrorNotReady)
    {
      printf("In while loop.\n");
      usleep(1000);
    }

    CUDA_SAFE_CALL(cudaEventDestroy(event));
  }

  // Display result
  printf("Result: %d = %f +%fi\n", ioi, hostMem[ioi].x, hostMem[ioi].y);
  printf("-----------------------------\n");
  printf("If test passed, result should be: 13 = -13342.587891 +12318.576172i\n");
  printf("If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i\n");

  // Free memory
  cudaFreeHost(hostMem);
  cudaFree(deviceMem);

  return EXIT_SUCCESS;
}

int createArray(Complex *arrayPtr, int arrayLength)
{
  unsigned int a;
  int status;
  float timeVal;

  timeVal = time(NULL);
  srand(timeVal);
  //printf("timeVal = %f\n", timeVal);

  for(a = 0;a < arrayLength;++a)
  {
    arrayPtr[a].x = a;
    arrayPtr[a].y = a;
    //printf("%d = %f +%fi\n", a, arrayPtr[a].x, arrayPtr[a].y);
  }

  status = 1;
  //printf("%d = %f +%fi\n", 3, arrayPtr[3].x, arrayPtr[3].y);

  return status;
}

When I run it, I get the following output:

[root@localhost CudaFFTBenchmark]# asyncFftTest

Using device 0: GeForce GTX 280

In while loop.

Result: 13 = -131313.000000 +13.000000i

-----------------------------

If test passed, result should be: 13 = -13342.587891 +12318.576172i

If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i