cufftExecC2C and cudaMemcpyAsync Doing the FFT without 100% CPU usage

isaacgerg · September 3, 2008, 3:09pm

Hi everyone,

I would like to perform 1D C2C FFTs without causing the CPU utilization to go to 100%. I have seen many forum posts about using cudaMemcpyAsync and to look at the asyncAPI example. However, I have tried the recommendations that all of these posts talk about. None of them work. And yes, I am using pinned memory via cudaMallocHost().

Would someone be willing to please post some code that does the following and does not eat up 100% CPU:

Copy data from host memory (CPU) to GPU
FFT the data via the GPU
Copy result back to the host memory (CPU)

From all the postings I have seen, something like the following should work, but doesnt:

cudaEventCreate(evt);

cudaAsyncMemcpy(device, host, size, 0);

cufftExecC2c(device, plan, etc..)

cudaAsyncMemcpy(host, device, size, 0);

while(cudaQueryEvent(evt) == cudaErrorNotRead)

{

   usleep(1000);

}

I have tried several variations of this covered in the many posts on the topic. I have tried appending each step (of the 3 I mentioned above) with cudaThreadSync() stuff. I have also tried to wrap each step with sleeps and had no luck… see below:

cudaEventCreate(evt);

cudaAsyncMemcpy(device, host, size, 0);

while(cudaQueryEvent(evt) == cudaErrorNotRead)

{

   usleep(1000);

}

cudaEventCreate(evt2);

cufftExecC2c(device, plan, etc..)

while(cudaQueryEvent(evt2) == cudaErrorNotRead)

{

   usleep(1000);

}

cudaEventCreate(evt3)

cudaAsyncMemcpy(host, device, size, 0);

while(cudaQueryEvent(evt3) == cudaErrorNotRead)

{

   usleep(1000);

}

Please help!

mfatica · September 3, 2008, 3:18pm

The 100% CPU utilization is triggered by the copy back ( CPU goes in a spin wait).
You can avoid it delaying the call to the copy back.

copy(device,host)
FFTonGPU
do_something_else_on_CPU(or just sleep)
copy(host,device)

isaacgerg · September 3, 2008, 3:27pm

Why does the CPU have to go into a spin wait? Why cant I asynchronously copy the data back to the CPU so my thread yeilds to other processes in the system?

Additionally, this answer does not solve my problem. When I use the pseudo code mentioned above, I dont see the correct FFT results in the host memory (regardless of the fact it might exhibit 100% utilization). I only get the correct answer when I do synchronous memcps and wrap the FFT in a safe call macro. See code following:

CUDA_SAFE_CALL(cudaMemcpy(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice));

CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD));

CUDA_SAFE_CALL(cudaMemcpy(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyDeviceToHost));

mfatica · September 3, 2008, 4:07pm

It goes in spin wait to reduce latency.

CUDA_SAFE_CALL macros are completely useless in release mode, you have a bug somewhere else. You should see the correct result in host memory after the cudamemcpy.

isaacgerg · September 3, 2008, 5:43pm

How do I know I am in release mode?

Here is the code I am running. You will see it doesnt give the correct results. What is wrong with it?

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

#include <cutil.h>

// includes, special

#include <ctype.h>

#include <stdio.h>

#include <stdlib.h>

#include <unistd.h>

// Complex data type

typedef float2 Complex;

int createArray(Complex *arrayPtr, int arrayLength);

int main(int argc, char** argv)

{

  int fftSize = 1*1024;

  int memSizeHost = 512*(1024*1024);

  Complex* hostMem = 0;

  Complex* deviceMem = 0;

  int batchSize = 1;

 CUT_DEVICE_INIT(1, "");

 // Allocate memory on GPU and RAM

  memSizeHost = (memSizeHost / (fftSize*sizeof(Complex))) * (fftSize*sizeof(Complex));

  int memSizeDevice = sizeof(Complex) * fftSize * batchSize;

 // Allocate memory in RAM

  CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, memSizeHost));

  CUDA_SAFE_CALL(cudaMalloc((void **) &deviceMem, memSizeDevice));   // Allocate array on device

  // Initialize GPU memory to nan and host memory to zero

  CUDA_SAFE_CALL(cudaMemset(deviceMem, 255, memSizeDevice));

  memset(hostMem, 0, memSizeHost);

 // Ensure memory allocation functions worked correctly.

  if(hostMem == NULL)

  {

    printf("Error -- Failed to allocate host memory!\n");

    exit(EXIT_FAILURE);

  }

 if(deviceMem == NULL)

  {

    printf("Error -- Failed to allocate device memory!\n");

    exit(EXIT_FAILURE);

  }

 // Create input data array.

  createArray(hostMem, memSizeHost/sizeof(Complex));

 // Create FFT plan

  cufftHandle plan;

  CUFFT_SAFE_CALL(cufftPlan1d(&plan, fftSize, CUFFT_C2C, batchSize));

 int ioi = 13;

  int idx = 0;

  if (0)

  {

    // Synchronous Processing

    CUDA_SAFE_CALL(cudaMemcpy(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice));

    CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD));

    // This modifies some of the host memory.  If we see it after copying back the data

    //   from the GPU to the host machine, the copy did not work.

    hostMem[ioi].x = -131313;

    CUDA_SAFE_CALL(cudaMemcpy(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyDeviceToHost));

  }

  else

  {

    // Asychronous Processing

    cudaEvent_t event;

    cudaEventCreate(&event);

    cudaEventRecord(event, 0);

    cudaMemcpyAsync(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice, 0);

    cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD);

   // This modifies some of the host memory.  If we see it after copying back the data

    //   from the GPU to the host machine, the copy did not work.

    hostMem[ioi].x = -131313;

   cudaMemcpyAsync(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyHostToDevice, 0);

    cudaEventRecord(event, 0);

    //cudaThreadSynchronize();

    //cudaEventSynchronize(event);

    while (cudaEventQuery(event) == cudaErrorNotReady)

    {

      printf("In while loop.\n");

      usleep(1000);

    }

    CUDA_SAFE_CALL(cudaEventDestroy(event));

  }

 // Display result

  printf("Result: %d = %f +%fi\n", ioi, hostMem[ioi].x, hostMem[ioi].y);

  printf("-----------------------------\n");

  printf("If test passed, result should be: 13 = -13342.587891 +12318.576172i\n");

  printf("If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i\n");

 // Free memory

  cudaFreeHost(hostMem);

  cudaFree(deviceMem);

 return EXIT_SUCCESS;

}

int createArray(Complex *arrayPtr, int arrayLength)

{

  unsigned int a;

  int status;

  float timeVal;

  timeVal = time(NULL);

  srand(timeVal);

  //printf("timeVal = %f\n", timeVal);

 for(a = 0;a < arrayLength;++a)

  {

    arrayPtr[a].x = a;

    arrayPtr[a].y = a;

    //printf("%d = %f +%fi\n", a, arrayPtr[a].x, arrayPtr[a].y);

  }

  status = 1;

  //printf("%d = %f +%fi\n", 3, arrayPtr[3].x, arrayPtr[3].y);

 return status;

}

When I run it, I get the following output:

[root@localhost CudaFFTBenchmark]# asyncFftTest

Using device 0: GeForce GTX 280

In while loop.

Result: 13 = -131313.000000 +13.000000i

-----------------------------

If test passed, result should be: 13 = -13342.587891 +12318.576172i

If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i

isaacgerg · September 3, 2008, 6:29pm

See above post.

bdg146psu · September 5, 2008, 6:15pm

Found the mistake, the second async memcpy should not be:
cudaMemcpyHostToDevice

but should be
cudaMemcpyDeviceToHost

Whoops! Sorry!

Topic		Replies	Views
CPU usage at 99% while kernel is running CUDA Programming and Performance	5	4458	September 3, 2008
Multiple batches of 1D FFT using cuFFT GPU-Accelerated Libraries	10	4950	October 29, 2019
cuFFT 2^15+ issues? GPU-Accelerated Libraries	3	1803	January 2, 2013
Got out of memory from cudaMemcpy CUDA Programming and Performance	13	3870	January 28, 2022
cuFFT cufftPlan1d and cufftExecR2C issues GPU-Accelerated Libraries	4	2359	July 13, 2016
bets way to return a float value sync or assync CUDA Programming and Performance	26	10310	May 7, 2009
Memory leak in cuFFT (cuda 5.0)? GPU-Accelerated Libraries	8	3578	January 27, 2013
cufft doubt comparing r2c and c2c 2D FFTs CUDA Programming and Performance	28	13479	October 27, 2010
cufftPlan2d fails CUDA Programming and Performance	14	21012	September 17, 2007
Simple CUFFT cuda code not working... CUDA Programming and Performance	1	4264	September 9, 2009

cufftExecC2C and cudaMemcpyAsync Doing the FFT without 100% CPU usage

Related topics