Hello,
I have a very simple CUDA program that performs an FFT using cuFFT, so I tried to test your CUDA-x86 solution with it.
The source code is:
#include <assert>
#include <stdio>
#include <stdlib>
#include <string>
#include <cufft>
#include <cuda>
#include <cuda_runtime>
#include <time>
#include <iostream>
#define NB_ITER 1000
#define DATA_W 512
#define DATA_H 512
////////////////////////////////////////////////////////////////////////////////
// For data
////////////////////////////////////////////////////////////////////////////////
// Draws one pseudo-random sample from the C library generator.
// rand() lies in [0, RAND_MAX]; dividing by RAND_MAX and adding 1 maps the
// sample into the closed interval [1, 2]. The sequence depends on the seed
// previously installed with srand().
float getRand(void)
{
    const float span = static_cast<float>(RAND_MAX);
    return static_cast<float>(rand() / span + 1);
}
bool test2(void){
//////////////////////////////
// Declaration
/////////////////////////////
float *h_data;
float *d_data;
cufftComplex*
*d_DataSpectrum0;
cufftHandle
fftPlan1;
cudaStream_t stream2, stream1, stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
//////////////////////////////
// Allocation
/////////////////////////////
cudaMallocHost((void**)&h_data , DATA_H * DATA_W * sizeof(float)) ;
cudaMalloc((void **)&d_data, DATA_H * DATA_W * sizeof(float)) ;
cudaMalloc((void **)&d_DataSpectrum0, DATA_H * DATA_W * sizeof(cufftComplex)) ;
//////////////////////////////
// Data
/////////////////////////////
srand(2010);
for(int i = 0; i < DATA_H * DATA_W; i++)
h_data[i] = getRand();
cufftPlan2d(&fftPlan1, DATA_H, DATA_W, CUFFT_R2C);
cufftSetStream(fftPlan1, stream2);
//////////////////////////////
// copies & compute
/////////////////////////////
cudaMemcpyAsync(d_data, h_data, DATA_H * DATA_W * sizeof(float), cudaMemcpyHostToDevice, 0) ;
cudaDeviceSynchronize();
struct timespec tBegin;
struct timespec tEnd;
clock_gettime(CLOCK_REALTIME, &tBegin );
for (int i = 0 ; i < NB_ITER ; i++)
{
if ( cufftExecR2C(fftPlan1, (cufftReal*)d_data, (cufftComplex*)d_DataSpectrum0) == CUFFT_SUCCESS )
cudaDeviceSynchronize();
else
std::cout << "error" << std::endl;
}
clock_gettime(CLOCK_REALTIME, &tEnd);
double Ttime = (( tEnd.tv_sec *1e6 + ((float) tEnd.tv_nsec / 1000.0)) - (tBegin.tv_sec *1e6 + ((float)tBegin.tv_nsec / 1000.0))) / 1000.0 ;
std::cout << "- cuFFT: " << Ttime/(float)NB_ITER << " ms " << NB_ITER << " on iterations" << std::endl;
//////////////////////////////
// Desallocation
/////////////////////////////
cufftDestroy(fftPlan1);
cudaFree(d_DataSpectrum0);
cudaFreeHost(h_data);
cudaFree(d_data);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaStreamDestroy(stream3);
return true;
}
int main(int argc, char **argv)
{
int dev;
cudaGetDevice(&dev);
cudaSetDevice(dev);
test2();
cudaDeviceReset();
}
I have two problems:
1 - With CUDA 4.1, the NVIDIA headers are not compatible with your compiler.
2 - With CUDA 4.0, I get a segmentation fault.
To compile, I used these options:
#compiler
PGI := /opt/pgi/linux86-64/2012/bin/pgc++
#include
INCCUDA = -I /opt/pgi/linux86-64/2012/cuda/4.0/include
#libraries
LIB = -lm -lrt
LIBCUDA = -L /opt/pgi/linux86-64/2012/cuda/4.0/lib64 -lcufft
#flags
FLAGS = --no_using_std -Mcudax86 -DUNIX -O2
My platform: Ubuntu 11.04, 64-bit
My processor: Intel i7 2600K
My version of PGI: 12.3
Of course, this code works well on NVIDIA GPU cards.
Could you please help me solve this problem? Thank you.