Doing FFT along rows

Hi,

I tried doing FFT on a 2D array along rows using cufftPlanMany(). The array is in column major order. The code and output is pasted next. Can someone help here and tell me what arguments do I pass to achieve this. You can assume a 4x4 matrix and hard code the values to be passed to cufftPlanMany(). I just want to understand how I could make this work. At this moment, cufftPlanMany seems to be working only on the first row.

#include <stdio.h>

#include <cufft.h>

typedef cuFloatComplex cF;

#define CUDA(call) do {                                         \

        cudaError_t _e = (call);                                \

        if (_e == cudaSuccess) break;                           \

        fprintf(stderr, __FILE__":%d: cuda failure: %s (%d)",   \

                __LINE__, cudaGetErrorString(_e), _e);          \

        exit(-1);                                               \

    } while (0)

#define CUFFT(call) do {                                    \

        cufftResult_t _e = (call);                          \

        if (_e == CUFFT_SUCCESS) break;                     \

        fprintf(stderr, __FILE__":%d: cufft failure: (%d)", \

                __LINE__, _e);                              \

        exit(-1);                                           \

    } while (0)

void show(cF *in, int rows, int cols, char *str)

{

    printf("%s=\n", str);

unsigned bytes = sizeof(cF) * rows * cols;

    cF *in_h = (cF *)malloc(bytes);

    CUDA(cudaMemcpy(in_h, in, bytes, cudaMemcpyDeviceToHost));

    for (int i=0; i < rows; i++) {

        for(int j=0; j < cols; j++) {

            printf("%f %fi  ", in_h[i + j* rows].x, in_h[i + j * rows].y);

        }

        printf("\n");

    }

    free(in_h);

}

int main()

{

    int rows = 4;

    int cols = 4;

    unsigned bytes = sizeof(cF) * rows * cols;

cF *d_in, *d_out;

    CUDA(cudaMalloc(&d_in , bytes));

    CUDA(cudaMalloc(&d_out, bytes));

cF *h_in = (cF *)malloc(bytes);

    for (int i=0; i < rows * cols; i++)

        h_in[i] = make_float2((float)rand()/RAND_MAX, (float)rand()/RAND_MAX);

// Move data to device

    CUDA(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice));

// Print input

    show( d_in, rows, cols, "d_in");

// Zero out output buffer

    CUDA(cudaMemset(d_out, 0, bytes));

// create FFT Plan

    cufftHandle plan;

    int stride = rows;

    int rank   = cols; // Number of points

    int length = cols;

// 1D FFT Plan

    CUFFT(cufftPlanMany(&plan, 1, &rank, &rows, stride, sixteen,

                        &sixteen, stride, rows, CUFFT_C2C, 4));

// Execute forward FFT

    CUFFT(cufftExecC2C(plan, (cF *)d_in, (cF *)d_out, CUFFT_FORWARD));

show( d_out, rows, cols, "d_out");

CUFFT(cufftDestroy(plan));

}

$ ./a.out

d_in=

0.394383 0.840188i  0.553970 0.277775i  0.717297 0.635712i  0.400944 0.156679i

0.798440 0.783099i  0.628871 0.477397i  0.606969 0.141603i  0.108809 0.129790i

0.197551 0.911647i  0.513401 0.364784i  0.242887 0.016301i  0.218257 0.998924i

0.768230 0.335223i  0.916195 0.952230i  0.804177 0.137232i  0.839112 0.512932i

d_out=

2.066594 1.910353i  -0.201818 0.051450i  0.156766 1.041446i  -0.444010 0.357502i

0.000000 0.000000i  0.000000 0.000000i  0.000000 0.000000i  0.000000 0.000000i

0.000000 0.000000i  0.000000 0.000000i  0.000000 0.000000i  0.000000 0.000000i

0.000000 0.000000i  0.000000 0.000000i  0.000000 0.000000i  0.000000 0.000000i

This should work with CUDA 4.1.

cufftPlanMany is the right function to use:
-for the input: the stride is rows, the distance is 1
-for the output: the stride is rows, the distance is 1 ( if you want the same layout otherwise, swap them)
-batch = cols

Thanks so much. This works with CUDA 4.0 as well.!!!