cufftExecC2C and cudaMemcpyAsync Doing the FFT without 100% CPU usage

Hi everyone,

I would like to perform 1D C2C FFTs without causing the CPU utilization to go to 100%. I have seen many forum posts about using cudaMemcpyAsync and to look at the asyncAPI example. However, I have tried the recommendations that all of these posts talk about. None of them work. And yes, I am using pinned memory via cudaMallocHost().

Would someone be willing to please post some code that does the following and does not eat up 100% CPU:

  1. Copy data from host memory (CPU) to GPU

  2. FFT the data via the GPU

  3. Copy result back to the host memory (CPU)

From all the postings I have seen, something like the following should work, but doesn't:

cudaEventCreate(evt);

cudaAsyncMemcpy(device, host, size, 0);

cufftExecC2c(device, plan, etc..)

cudaAsyncMemcpy(host, device, size, 0);

while(cudaQueryEvent(evt) == cudaErrorNotRead)

{

   usleep(1000);

}

I have tried several variations of this covered in the many posts on the topic. I have tried appending each step (of the 3 I mentioned above) with cudaThreadSync() stuff. I have also tried to wrap each step with sleeps and had no luck… see below:

cudaEventCreate(evt);

cudaAsyncMemcpy(device, host, size, 0);

while(cudaQueryEvent(evt) == cudaErrorNotRead)

{

   usleep(1000);

}

cudaEventCreate(evt2);

cufftExecC2c(device, plan, etc..)

while(cudaQueryEvent(evt2) == cudaErrorNotRead)

{

   usleep(1000);

}

cudaEventCreate(evt3)

cudaAsyncMemcpy(host, device, size, 0);

while(cudaQueryEvent(evt3) == cudaErrorNotRead)

{

   usleep(1000);

}

Please help!

The 100% CPU utilization is triggered by the copy back (the CPU goes into a spin wait).
You can avoid it by delaying the call to the copy back.

copy(device,host)
FFTonGPU
do_something_else_on_CPU(or just sleep)
copy(host,device)

Why does the CPU have to go into a spin wait? Why can't I asynchronously copy the data back to the CPU so my thread yields to other processes in the system?

Additionally, this answer does not solve my problem. When I use the pseudo code mentioned above, I don't see the correct FFT results in the host memory (regardless of the fact it might exhibit 100% utilization). I only get the correct answer when I do synchronous memcpys and wrap the FFT in a safe-call macro. See code following:

CUDA_SAFE_CALL(cudaMemcpy(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice));

CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD));

CUDA_SAFE_CALL(cudaMemcpy(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyDeviceToHost));

It goes in spin wait to reduce latency.

CUDA_SAFE_CALL macros are completely useless in release mode, you have a bug somewhere else. You should see the correct result in host memory after the cudamemcpy.

How do I know I am in release mode?

Here is the code I am running. You will see it doesn't give the correct results. What is wrong with it?

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

#include <cutil.h>

// includes, special

#include <ctype.h>

#include <stdio.h>

#include <stdlib.h>

#include <unistd.h>

// Complex data type

typedef float2 Complex;

int createArray(Complex *arrayPtr, int arrayLength);

int main(int argc, char** argv)

{

  int fftSize = 1*1024;

  int memSizeHost = 512*(1024*1024);

  Complex* hostMem = 0;

  Complex* deviceMem = 0;

  int batchSize = 1;

 CUT_DEVICE_INIT(1, "");

 // Allocate memory on GPU and RAM

  memSizeHost = (memSizeHost / (fftSize*sizeof(Complex))) * (fftSize*sizeof(Complex));

  int memSizeDevice = sizeof(Complex) * fftSize * batchSize;

 // Allocate memory in RAM

  CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, memSizeHost));

  CUDA_SAFE_CALL(cudaMalloc((void **) &deviceMem, memSizeDevice));   // Allocate array on device

  // Initialize GPU memory to nan and host memory to zero

  CUDA_SAFE_CALL(cudaMemset(deviceMem, 255, memSizeDevice));

  memset(hostMem, 0, memSizeHost);

 // Ensure memory allocation functions worked correctly.

  if(hostMem == NULL)

  {

    printf("Error -- Failed to allocate host memory!\n");

    exit(EXIT_FAILURE);

  }

 if(deviceMem == NULL)

  {

    printf("Error -- Failed to allocate device memory!\n");

    exit(EXIT_FAILURE);

  }

 // Create input data array.

  createArray(hostMem, memSizeHost/sizeof(Complex));

 // Create FFT plan

  cufftHandle plan;

  CUFFT_SAFE_CALL(cufftPlan1d(&plan, fftSize, CUFFT_C2C, batchSize));

 int ioi = 13;

  int idx = 0;

  if (0)

  {

    // Synchronous Processing

    CUDA_SAFE_CALL(cudaMemcpy(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice));

    CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD));

    // This modifies some of the host memory.  If we see it after copying back the data

    //   from the GPU to the host machine, the copy did not work.

    hostMem[ioi].x = -131313;

    CUDA_SAFE_CALL(cudaMemcpy(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyDeviceToHost));

  }

  else

  {

    // Asychronous Processing

    cudaEvent_t event;

    cudaEventCreate(&event);

    cudaEventRecord(event, 0);

    cudaMemcpyAsync(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice, 0);

    cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD);

   // This modifies some of the host memory.  If we see it after copying back the data

    //   from the GPU to the host machine, the copy did not work.

    hostMem[ioi].x = -131313;

   cudaMemcpyAsync(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyHostToDevice, 0);

    cudaEventRecord(event, 0);

    //cudaThreadSynchronize();

    //cudaEventSynchronize(event);

    while (cudaEventQuery(event) == cudaErrorNotReady)

    {

      printf("In while loop.\n");

      usleep(1000);

    }

    CUDA_SAFE_CALL(cudaEventDestroy(event));

  }

 // Display result

  printf("Result: %d = %f +%fi\n", ioi, hostMem[ioi].x, hostMem[ioi].y);

  printf("-----------------------------\n");

  printf("If test passed, result should be: 13 = -13342.587891 +12318.576172i\n");

  printf("If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i\n");

 // Free memory

  cudaFreeHost(hostMem);

  cudaFree(deviceMem);

 return EXIT_SUCCESS;

}

int createArray(Complex *arrayPtr, int arrayLength)

{

  unsigned int a;

  int status;

  float timeVal;

  timeVal = time(NULL);

  srand(timeVal);

  //printf("timeVal = %f\n", timeVal);

 for(a = 0;a < arrayLength;++a)

  {

    arrayPtr[a].x = a;

    arrayPtr[a].y = a;

    //printf("%d = %f +%fi\n", a, arrayPtr[a].x, arrayPtr[a].y);

  }

  status = 1;

  //printf("%d = %f +%fi\n", 3, arrayPtr[3].x, arrayPtr[3].y);

 return status;

}

When I run it, I get the following output:

[root@localhost CudaFFTBenchmark]# asyncFftTest

Using device 0: GeForce GTX 280

In while loop.

Result: 13 = -131313.000000 +13.000000i

-----------------------------

If test passed, result should be: 13 = -13342.587891 +12318.576172i

If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i

See above post.

Found the mistake, the second async memcpy should not be:
cudaMemcpyHostToDevice

but should be
cudaMemcpyDeviceToHost

Whoops! Sorry!