cuFFT & cuda-x86

Hello,

I have a very simple CUDA program that performs an FFT using cuFFT, so I tried to build and run it with your CUDA-x86 solution.

The source code is:

#include <assert>
#include <stdio>
#include <stdlib>
#include <string>
#include <cufft>
#include <cuda>
#include <cuda_runtime>
#include <time>
#include <iostream>


// Number of FFT executions performed inside the timed benchmark loop.
#define NB_ITER 1000

// Width and height (in elements) of the 2D input; they size the host/device
// buffers and are passed to cufftPlan2d as (DATA_H, DATA_W).
#define DATA_W 512
#define DATA_H 512


////////////////////////////////////////////////////////////////////////////////
// For data
////////////////////////////////////////////////////////////////////////////////

// Draws the next value from rand(), scales it by RAND_MAX and shifts it by one,
// so the returned sample lies in the closed interval [1, 2].
float getRand(void){
    const float scaled = rand() / float(RAND_MAX);
    return scaled + 1;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Reports a failed cuFFT call on stderr.
// Returns true when `status` is CUFFT_SUCCESS, false otherwise.
static bool cufftOk(cufftResult status, const char *what){
    if (status == CUFFT_SUCCESS)
        return true;
    std::cerr << "cuFFT error in " << what << ": code "
              << (int)status << std::endl;
    return false;
}

// Times NB_ITER executions of a DATA_H x DATA_W real-to-complex 2D FFT and
// prints the average time per execution in milliseconds.
//
// The input is filled with pseudo-random values (fixed seed 2010), copied to
// the device once, and then transformed NB_ITER times on a dedicated stream,
// synchronizing after every execution.
//
// Returns true when every CUDA/cuFFT call succeeded, false otherwise (the
// failing call is reported on stderr). All resources that were successfully
// acquired are released before returning, whatever the outcome.
bool test2(void){

//////////////////////////////
//  Declaration
/////////////////////////////

    const size_t nbElem = (size_t)DATA_H * DATA_W;

    // Everything starts out "not acquired" so the cleanup section below can
    // run safely after a failure at any step.
    float        *h_data          = NULL;
    float        *d_data          = NULL;
    cufftComplex *d_DataSpectrum0 = NULL;

    cufftHandle  fftPlan1 = 0;
    bool         planCreated = false;

    cudaStream_t stream2 = 0;
    bool         streamCreated = false;

    bool ok = cudaOk(cudaStreamCreate(&stream2), "cudaStreamCreate");
    streamCreated = ok;

//////////////////////////////
//  Allocation
/////////////////////////////
    ok = ok && cudaOk(cudaMallocHost((void **)&h_data, nbElem * sizeof(float)),
                      "cudaMallocHost(h_data)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_data, nbElem * sizeof(float)),
                      "cudaMalloc(d_data)");
    // NOTE: a 2D R2C transform only writes DATA_H * (DATA_W/2 + 1) complex
    // values; the full DATA_H * DATA_W allocation is kept from the original.
    ok = ok && cudaOk(cudaMalloc((void **)&d_DataSpectrum0, nbElem * sizeof(cufftComplex)),
                      "cudaMalloc(d_DataSpectrum0)");

//////////////////////////////
//  Data
/////////////////////////////
    if (ok) {
        // h_data is only touched once its allocation is known to be valid.
        srand(2010);
        for (size_t i = 0; i < nbElem; i++)
            h_data[i] = getRand();
    }

    if (ok) {
        planCreated = cufftOk(cufftPlan2d(&fftPlan1, DATA_H, DATA_W, CUFFT_R2C),
                              "cufftPlan2d");
        ok = planCreated;
    }
    ok = ok && cufftOk(cufftSetStream(fftPlan1, stream2), "cufftSetStream");

//////////////////////////////
//  copies & compute
/////////////////////////////
    ok = ok && cudaOk(cudaMemcpyAsync(d_data, h_data, nbElem * sizeof(float),
                                      cudaMemcpyHostToDevice, 0),
                      "cudaMemcpyAsync(d_data)");
    // Make sure the upload has finished before the timed section starts.
    ok = ok && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    if (ok) {
        struct timespec tBegin;
        struct timespec tEnd;

        // CLOCK_MONOTONIC is not affected by wall-clock adjustments, which
        // makes it the appropriate clock for measuring an interval.
        clock_gettime(CLOCK_MONOTONIC, &tBegin);

        for (int i = 0; ok && i < NB_ITER; i++) {
            // Synchronize after each execution so every FFT is complete
            // before the next one is issued; stop at the first failure.
            ok = cufftOk(cufftExecR2C(fftPlan1, (cufftReal *)d_data, d_DataSpectrum0),
                         "cufftExecR2C")
              && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        }

        clock_gettime(CLOCK_MONOTONIC, &tEnd);

        if (ok) {
            // Elapsed time in milliseconds, computed entirely in double.
            const double Ttime = (double)(tEnd.tv_sec - tBegin.tv_sec) * 1e3
                               + (double)(tEnd.tv_nsec - tBegin.tv_nsec) / 1e6;
            std::cout << "- cuFFT: " << Ttime / (float)NB_ITER << " ms " << NB_ITER << " on iterations" << std::endl;
        }
    }

//////////////////////////////
//  Deallocation
/////////////////////////////
    // Best-effort cleanup: release only what was actually acquired.
    if (planCreated)
        cufftDestroy(fftPlan1);
    if (d_DataSpectrum0 != NULL)
        cudaFree(d_DataSpectrum0);
    if (h_data != NULL)
        cudaFreeHost(h_data);
    if (d_data != NULL)
        cudaFree(d_data);
    if (streamCreated)
        cudaStreamDestroy(stream2);

    return ok;
}

// Program entry point: selects the current CUDA device, runs the cuFFT
// benchmark (test2) and resets the device.
//
// Returns 0 when the device could be selected and test2() reported success,
// 1 otherwise, so that a failure is visible in the process exit status.
int main(int argc, char **argv)
{
    (void)argc;   // command-line arguments are not used
    (void)argv;

    int dev = 0;
    cudaError_t status = cudaGetDevice(&dev);
    if (status == cudaSuccess)
        status = cudaSetDevice(dev);
    if (status != cudaSuccess) {
        std::cerr << "CUDA device selection failed: "
                  << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    const bool ok = test2();

    cudaDeviceReset();

    return ok ? 0 : 1;
}

I have two problems:
1 - Using CUDA 4.1, the NVIDIA headers are not compatible with your compiler.
2 - Using CUDA 4.0, I get a segmentation fault.

To compile, I used these options:

#compiler
PGI	:= /opt/pgi/linux86-64/2012/bin/pgc++

#include
INCCUDA = -I /opt/pgi/linux86-64/2012/cuda/4.0/include

#libraries
  LIB = -lm -lrt
  LIBCUDA = -L /opt/pgi/linux86-64/2012/cuda/4.0/lib64 -lcufft

#flags
  FLAGS = --no_using_std -Mcudax86 -DUNIX -O2

My platform is : Ubuntu 11.04 64bits
My processor : Intel i7 2600K
My version of PGI : 12.3

Of course, this code works correctly on NVIDIA GPUs.
Could you please help me solve this problem? Thank you.

Hi MathieuT,

All CUDA code, including libraries, needs to be compiled with CUDA-x86 before it can be used. Unfortunately, I don’t think NVIDIA has released a version of cuFFT that will work with CUDA-x86. Instead, you’ll need to use conditional compilation to call an x86 FFT library, such as FFTW.

1 - using cuda 4.1, the nvidia headers are not compatible with your compiler

Correct. CUDA 4.1 support has not yet been added.

Best Regards,
Mat