Hi,
I am trying to do complex conjugate in cuda. i am using geforce 9600gt, cuda 2.0, and matlab 2007a. i take simpleCUFFT as a reference. no error message, but the results is not correct. seems that it won’t process the kernel. any help??
this is my code:
[codebox]#include “cufft.h”
#include “cuda.h”
#include “mex.h”
#include “cuda_runtime_api.h”
#include “cutil.h”
typedef float2 Complex;
cufftComplex *fft_conj(Complex *, Complex *, int , int );
static global void ComplexConj(Complex* a, const Complex* b, int size)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i].x = b[i].x;
a[i].y = b[i].y * (-1.0);
}
}
void pack_r2c(Complex *output_float,
double *input_re,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = 0.0f;
}
}
void pack_c2c(Complex *output_float,
double *input_re,
double *input_im,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = input_im[i];
}
}
void unpack_c2c(Complex *input_float,
double *output_re,
double *output_im,
int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_re[i] = input_float[i].x;
output_im[i] = input_float[i].y;
}
}
void mexFunction(int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)
{
double *reffR, *reffI, *currR, *currI, *outR, *outI;
int m, n;
cufftComplex *data1, *data2;
if( nrhs < 1 ) mexErrMsgTxt( "Input argument not defined." );
m = mxGetM(prhs[0]);
n = mxGetN(prhs[0]);
/* Allocating host memory. */
data1 = (Complex *)mxMalloc(sizeof(Complex) * n * m);
data2 = (Complex *)mxMalloc(sizeof(Complex) * n * m);
reffR = mxGetPr(prhs[0]);
currR = mxGetPr(prhs[1]);
if (mxIsComplex(prhs[0]))
{
reffI = mxGetPi(prhs[0]);
currI = mxGetPi(prhs[1]);
pack_c2c(data1, reffR, reffI, m*n);
pack_c2c(data2, currR, currI, m*n);
} else
{
pack_r2c(data1, reffR, m*n);
pack_r2c(data2, currR, m*n);
}
data1 = fft_conj(data1, data2, m, n);
plhs[0] = mxCreateDoubleMatrix(m, n, mxCOMPLEX);
outR = mxGetPr(plhs[0]);
outI = mxGetPi(plhs[0]);
unpack_c2c(data1, outR, outI, n*m);
mxFree(data1);
return;
}
cufftComplex *fft_conj(Complex *data1, Complex *data2, int m, int n)
{
// Allocate device memory for data
Complex *d_data1;
Complex *d_data2;
Complex *d_conjugate;
int memSize = sizeof(Complex) * m * n ;
cudaMalloc( (void **)&d_data1, memSize );
cudaMalloc( (void **)&d_data2, memSize );
cudaMalloc( (void **)&d_conjugate, memSize );
// Copy host memory to device
cudaMemcpy(d_data1, data1, memSize, cudaMemcpyHostToDevice);
cudaMemcpy(d_data2, data2, memSize, cudaMemcpyHostToDevice);
// CUFFT plan
cufftHandle plan1;
cufftHandle plan2;
cufftPlan2d(&plan1, n, m, CUFFT_C2C);
cufftPlan2d(&plan2, n, m, CUFFT_C2C);
//cufftPlan2d(&plan3, n, m, CUFFT_C2C);
// FFT execution
cufftExecC2C(plan1, (cufftComplex *)d_data1, (cufftComplex *)d_data1, CUFFT_FORWARD);
cufftExecC2C(plan2, (cufftComplex *)d_data2, (cufftComplex *)d_data2, CUFFT_FORWARD);
dim3 dimBlock(n,m);
dim3 dimGrid ((n/dimBlock.x) + (!(n%dimBlock.x)?0:1) ,
(n/dimBlock.y) + (!(n%dimBlock.y)?0:1));
ComplexConj<<<dimGrid,dimBlock>>>(d_conjugate,d_data1,memSize);
CUT_CHECK_ERROR(“Kernel execution failed [ ComplexConj ]”);
// IFFT execution
//cufftExecC2C(plan3, (cufftComplex *)d_conjugate, (cufftComplex *)d_conjugate, CUFFT_INVERSE);
// Copy result to host
cudaMemcpy(data1, d_conjugate, memSize, cudaMemcpyDeviceToHost);
// Clear device memory
cufftDestroy(plan1);
cufftDestroy(plan2);
//cufftDestroy(plan3);
cudaFree(d_data1);
cudaFree(d_data2);
cudaFree(d_conjugate);
return data1;
}
[/codebox]