complex conjugate in mex

Hi,

I am trying to do complex conjugate in cuda. i am using geforce 9600gt, cuda 2.0, and matlab 2007a. i take simpleCUFFT as a reference. no error message, but the results is not correct. seems that it won’t process the kernel. any help??

this is my code:

[codebox]#include “cufft.h”

#include “cuda.h”

#include “mex.h”

#include “cuda_runtime_api.h”

#include “cutil.h”

typedef float2 Complex;

cufftComplex *fft_conj(Complex *, Complex *, int , int );

static global void ComplexConj(Complex* a, const Complex* b, int size)

{

const int numThreads = blockDim.x * gridDim.x;

const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

for (int i = threadID; i < size; i += numThreads)

{

    a[i].x = b[i].x;

    a[i].y = b[i].y * (-1.0);

}

}

void pack_r2c(Complex *output_float,

         double *input_re,

         int Ntot)

{

int i;

for (i = 0; i < Ntot; i++)

  {

          output_float[i].x = input_re[i];

          output_float[i].y = 0.0f;

  }

}

void pack_c2c(Complex *output_float,

         double *input_re,

         double *input_im,

         int Ntot)

{

int i;

for (i = 0; i < Ntot; i++)

 {

          output_float[i].x = input_re[i];

          output_float[i].y = input_im[i];

 }

}

void unpack_c2c(Complex *input_float,

           double *output_re,

           double *output_im,  

           int Ntot)

{

int i;

for (i = 0; i < Ntot; i++)

{

   output_re[i] = input_float[i].x;

   output_im[i] = input_float[i].y;

}

}

void mexFunction(int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)

{

double *reffR, *reffI, *currR, *currI, *outR, *outI;

int m, n;

cufftComplex *data1, *data2;



if( nrhs < 1 ) mexErrMsgTxt( "Input argument not defined." );

m = mxGetM(prhs[0]);

n = mxGetN(prhs[0]);



/* Allocating host memory. */

data1 = (Complex *)mxMalloc(sizeof(Complex) * n * m);

data2 = (Complex *)mxMalloc(sizeof(Complex) * n * m);

reffR = mxGetPr(prhs[0]);

currR = mxGetPr(prhs[1]);

if (mxIsComplex(prhs[0]))

{

	reffI = mxGetPi(prhs[0]);

	currI = mxGetPi(prhs[1]);

	pack_c2c(data1, reffR, reffI, m*n);

	pack_c2c(data2, currR, currI, m*n);

} else

{

	pack_r2c(data1, reffR, m*n);

	pack_r2c(data2, currR, m*n);

}

data1 = fft_conj(data1, data2, m, n);

plhs[0] = mxCreateDoubleMatrix(m, n, mxCOMPLEX);

outR = mxGetPr(plhs[0]);

outI = mxGetPi(plhs[0]);

unpack_c2c(data1, outR, outI, n*m);

mxFree(data1);

return;

}

cufftComplex *fft_conj(Complex *data1, Complex *data2, int m, int n)

{

// Allocate device memory for data

Complex *d_data1;

Complex *d_data2;

Complex *d_conjugate;

int memSize = sizeof(Complex) * m * n ;

cudaMalloc( (void **)&d_data1, memSize );

cudaMalloc( (void **)&d_data2, memSize );

cudaMalloc( (void **)&d_conjugate, memSize );

// Copy host memory to device

cudaMemcpy(d_data1, data1, memSize, cudaMemcpyHostToDevice);

cudaMemcpy(d_data2, data2, memSize, cudaMemcpyHostToDevice);

// CUFFT plan

cufftHandle plan1;

cufftHandle plan2;

cufftPlan2d(&plan1, n, m, CUFFT_C2C);

cufftPlan2d(&plan2, n, m, CUFFT_C2C);

//cufftPlan2d(&plan3, n, m, CUFFT_C2C);

// FFT execution

cufftExecC2C(plan1, (cufftComplex *)d_data1, (cufftComplex *)d_data1, CUFFT_FORWARD);

cufftExecC2C(plan2, (cufftComplex *)d_data2, (cufftComplex *)d_data2, CUFFT_FORWARD);

dim3 dimBlock(n,m);

dim3 dimGrid ((n/dimBlock.x) + (!(n%dimBlock.x)?0:1) ,

             (n/dimBlock.y) + (!(n%dimBlock.y)?0:1));

ComplexConj<<<dimGrid,dimBlock>>>(d_conjugate,d_data1,memSize);

CUT_CHECK_ERROR(“Kernel execution failed [ ComplexConj ]”);

// IFFT execution

//cufftExecC2C(plan3, (cufftComplex *)d_conjugate, (cufftComplex *)d_conjugate, CUFFT_INVERSE);

// Copy result to host

cudaMemcpy(data1, d_conjugate, memSize, cudaMemcpyDeviceToHost);

// Clear device memory

cufftDestroy(plan1);

cufftDestroy(plan2);

//cufftDestroy(plan3);

cudaFree(d_data1);

cudaFree(d_data2);

cudaFree(d_conjugate);

return data1;

}

[/codebox]

Problemo solved… :D