I propose two solutions to your problem; I recommend the first one unless you have a good reason to choose the second.
The first one stores the data as cufftComplex already on the host. Copying to the device is then a single plain cudaMemcpy, and it is easy to see what the code is doing…
Solution 1:
#include <cufft.h>
#include <stdio.h>
/*
 * Solution 1: forward 1-D complex-to-complex FFT of N samples with cuFFT.
 *
 * The host buffer is already an array of cufftComplex, so one plain
 * cudaMemcpy moves it to the device and one moves the result back. The
 * transform runs in place on the device buffer. The program prints the
 * input values followed by the transformed values.
 *
 * Returns 0 on success, or -1 after reporting the first failure on stderr.
 * Every resource acquired so far is released on all exit paths.
 */
int main()
{
    const int N = 10;
    const size_t bytes = sizeof(cufftComplex) * N;

    /* All locals are declared before the first `goto cleanup` so that no
       jump bypasses an initialization (required by C++). */
    int status = -1;             /* becomes 0 only if every step succeeds */
    int planCreated = 0;         /* tells cleanup whether `plan` is valid */
    cufftHandle plan = 0;
    cufftComplex* data = NULL;   /* host buffer   */
    cufftComplex* dData = NULL;  /* device buffer */

    data = (cufftComplex *) malloc(bytes);
    if (data == NULL)
    {
        fprintf(stderr, "Host error: Failed to allocate\n");
        goto cleanup;
    }

    /* Check the call's own return code instead of polling cudaGetLastError(). */
    if (cudaMalloc((void **) &dData, bytes) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to allocate\n");
        goto cleanup;
    }

    /* Fill the input signal on the host. */
    for (int ii = 0; ii < N; ii++)
    {
        data[ii].x = sinpi( .9*(float)ii/(float)N);
        data[ii].y = cospi( (float)ii/(float)N);
    }

    printf( "Org vals: \n");
    for (int ii = 0; ii < N; ii++)
    {
        printf ( "%f+i*%f\n", data[ii].x, data[ii].y );
    }

    if (cudaMemcpy(dData, data, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to copy to device\n");
        goto cleanup;
    }

    if (cufftPlan1d(&plan, N, CUFFT_C2C, 1) != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFT error: Plan creation failed");
        goto cleanup;
    }
    planCreated = 1;

    /* Same pointer for input and output: in-place transform. */
    if (cufftExecC2C(plan, dData, dData, CUFFT_FORWARD) != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        goto cleanup;
    }

    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
       replacement and reports errors raised while the transform executed. */
    if (cudaDeviceSynchronize() != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to synchronize\n");
        goto cleanup;
    }

    if (cudaMemcpy(data, dData, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to copy to host\n");
        goto cleanup;
    }

    printf( "fft vals: \n");
    for (int ii = 0; ii < N; ii++)
    {
        printf ( "%f+i*%f\n", data[ii].x, data[ii].y );
    }

    status = 0;

cleanup:
    /* Release in reverse order of acquisition; cudaFree(NULL) and free(NULL)
       are no-ops, so the partially-initialized case needs no extra checks. */
    if (planCreated)
    {
        cufftDestroy(plan);
    }
    cudaFree(dData);
    free(data);
    return status;
}
The second solution keeps your original code and fixes your memcpys. I do not recommend using it, because to me it looks more like a hack… I prefer code that is easy to understand…
Solution 2:
#include <cufft.h>
#include <stdio.h>
/*
 * Solution 2: forward 1-D complex-to-complex FFT of N samples with cuFFT,
 * starting from separate host arrays for the real and imaginary parts.
 *
 * The two host arrays are interleaved into one float2 device array with
 * strided cudaMemcpy2D copies (one float per "row", N rows). After the
 * in-place transform, only the real parts are copied back and printed.
 *
 * Returns 0 on success, or -1 after reporting the first failure on stderr.
 * Every resource acquired so far is released on all exit paths.
 */
int main() {
    const int N = 10;

    /* All locals are declared before the first `goto cleanup` so that no
       jump bypasses an initialization (required by C++). */
    int status = -1;                 /* becomes 0 only if every step succeeds */
    int planCreated = 0;             /* tells cleanup whether `plan` is valid */
    cufftHandle plan = 0;
    float * real_vec = NULL;         /* host vector, real part               */
    float * imag_vec = NULL;         /* host vector, imaginary part          */
    float * resultReal = NULL;       /* host vector, real part of the result */
    float2 * complex_vec_d = NULL;   /* device vector, single-precision complex */

    real_vec = (float *) malloc(sizeof(float) * N);
    imag_vec = (float *) malloc(sizeof(float) * N);
    resultReal = (float *) malloc(sizeof(float) * N);
    if (real_vec == NULL || imag_vec == NULL || resultReal == NULL)
    {
        fprintf(stderr, "Host error: Failed to allocate\n");
        goto cleanup;
    }

    /* Fill the input signal on the host. */
    for (int ii = 0; ii < N; ii++)
    {
        real_vec[ii] = sinpi( .9*(float)ii/(float)N);
        imag_vec[ii] = cospi( (float)ii/(float)N);
    }

    /* Check the call's own return code instead of polling cudaGetLastError(). */
    if (cudaMalloc((void **) &complex_vec_d, sizeof(float2) * N) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to allocate\n");
        goto cleanup;
    }

    /* Interleave: each "row" is one float wide; the destination pitch of two
       floats steps from one complex element to the next, the source pitch of
       one float walks the densely packed host array. */
    if (cudaMemcpy2D(complex_vec_d, 2 * sizeof(float),
                     real_vec, 1 * sizeof(float),
                     sizeof(float), N,
                     cudaMemcpyHostToDevice) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to copy real part to device\n");
        goto cleanup;
    }

    /* Same copy, but starting at the .y member of the first element so the
       values land in the imaginary slots. Only the address is computed here;
       the device pointer is not dereferenced on the host. */
    if (cudaMemcpy2D(&complex_vec_d[0].y, 2 * sizeof(float),
                     imag_vec, 1 * sizeof(float),
                     sizeof(float), N,
                     cudaMemcpyHostToDevice) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to copy imaginary part to device\n");
        goto cleanup;
    }

    if (cufftPlan1d(&plan, N, CUFFT_C2C, 1) != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFT error: Plan creation failed");
        goto cleanup;
    }
    planCreated = 1;

    /* Same pointer for input and output: in-place transform. */
    if (cufftExecC2C(plan, complex_vec_d, complex_vec_d, CUFFT_FORWARD)
            != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        goto cleanup;
    }

    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
       replacement and reports errors raised while the transform executed. */
    if (cudaDeviceSynchronize() != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to synchronize\n");
        goto cleanup;
    }

    /* De-interleave: gather only the real parts back into a dense host array. */
    if (cudaMemcpy2D(resultReal, 1 * sizeof(float),
                     complex_vec_d, 2 * sizeof(float),
                     sizeof(float), N,
                     cudaMemcpyDeviceToHost) != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: Failed to copy result to host\n");
        goto cleanup;
    }

    printf("fftvals:\n");
    for (int ii = 0; ii < N; ii++)
    {
        printf("%f\n", resultReal[ii]);
    }

    status = 0;

cleanup:
    /* Release in reverse order of acquisition; cudaFree(NULL) and free(NULL)
       are no-ops, so the partially-initialized case needs no extra checks. */
    if (planCreated)
    {
        cufftDestroy(plan);
    }
    cudaFree(complex_vec_d);
    free(resultReal);
    free(imag_vec);
    free(real_vec);
    return status;
}
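Basically, it fixes two mistakes in the cudaMemcpy2D calls:
- “complex_vec_d + 1” points to the next float2 element, not to the imaginary part of the first one; use “&complex_vec_d[0].y” to address the imaginary part…
- you passed the size of a pointer where you should pass the size of a float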