Here’s a full test case built around what you have shown:
$ cat t13.cu
#include <cufft.h>
#include <stdio.h>
#include <assert.h>
const int N_VEC = 8; // vertical - # of rows; also passed to cuFFT as the transform length
const int VEC_LEN = 4; // horizontal - # of columns; also passed as batch count and as istride/ostride

// Test case: plan a batched 1D C2C transform, run it forward over an
// N_VEC x VEC_LEN array filled with (1,0), and print the result row by row.
//
// inembed and onembed are deliberately NULL in this variant. With NULL embed
// pointers the istride/idist/ostride/odist arguments are not taken into
// account, which is the behavior this listing demonstrates.
int main(){
    const int nElem = N_VEC*VEC_LEN;
    const size_t nBytes = nElem*sizeof(cufftComplex);
    int nCol[1] = {N_VEC};
    cufftHandle plan;
    cufftResult res;
    cudaError_t err;

    res = cufftPlanMany(&plan, 1, nCol,      //plan, rank, n
                        NULL, VEC_LEN, 1,    //inembed, istride, idist
                        NULL, VEC_LEN, 1,    //onembed, ostride, odist
                        CUFFT_C2C, VEC_LEN); //type, n_batch
    assert(res == CUFFT_SUCCESS);

    cufftComplex *pSrc = NULL, *pDest = NULL, *h = NULL;
    err = cudaMalloc(&pSrc, nBytes);
    assert(err == cudaSuccess);
    err = cudaMalloc(&pDest, nBytes);
    assert(err == cudaSuccess);

    // Host staging buffer: every input sample is 1 + 0i.
    h = new cufftComplex[nElem];
    for (int i = 0; i < nElem; i++) {
        h[i].x = 1;
        h[i].y = 0;
    }
    err = cudaMemcpy(pSrc, h, nBytes, cudaMemcpyHostToDevice);
    assert(err == cudaSuccess);

    res = cufftExecC2C(plan, pSrc, pDest, CUFFT_FORWARD);
    assert(res == CUFFT_SUCCESS);

    // Reuse the host buffer for the transformed data.
    err = cudaMemcpy(h, pDest, nBytes, cudaMemcpyDeviceToHost);
    assert(err == cudaSuccess);

    // Print as N_VEC rows of VEC_LEN "re,im" pairs.
    for (int i = 0; i < N_VEC; i++) {
        for (int j = 0; j < VEC_LEN; j++) printf("%f,%f ", h[i*VEC_LEN+j].x, h[i*VEC_LEN+j].y);
        printf("\n");
    }

    // Release everything acquired above: plan, device buffers, host buffer.
    res = cufftDestroy(plan);
    assert(res == CUFFT_SUCCESS);
    err = cudaFree(pSrc);
    assert(err == cudaSuccess);
    err = cudaFree(pDest);
    assert(err == cudaSuccess);
    delete[] h;
    return 0;
}
$ nvcc -o t13 t13.cu -lcufft
$ cuda-memcheck ./t13
========= CUDA-MEMCHECK
8.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
8.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
8.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
8.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
========= ERROR SUMMARY: 0 errors
$
Note that this output doesn’t match what you indicated.
When I make the change indicated in comment #3 above, I get what is possibly the output you are expecting:
$ cat t13.cu
#include <cufft.h>
#include <stdio.h>
#include <assert.h>
const int N_VEC = 8; // vertical - # of rows; also passed to cuFFT as the transform length
const int VEC_LEN = 4; // horizontal - # of columns; also passed as batch count and as istride/ostride

// Test case: plan a batched 1D C2C transform, run it forward over an
// N_VEC x VEC_LEN array filled with (1,0), and print the result row by row.
//
// inembed and onembed are non-NULL in this variant (nCol is passed for both)
// so that the istride/idist/ostride/odist arguments are taken into account.
int main(){
    const int nElem = N_VEC*VEC_LEN;
    const size_t nBytes = nElem*sizeof(cufftComplex);
    int nCol[1] = {N_VEC};
    cufftHandle plan;
    cufftResult res;
    cudaError_t err;

    res = cufftPlanMany(&plan, 1, nCol,      //plan, rank, n
                        nCol, VEC_LEN, 1,    //inembed, istride, idist
                        nCol, VEC_LEN, 1,    //onembed, ostride, odist
                        CUFFT_C2C, VEC_LEN); //type, n_batch
    assert(res == CUFFT_SUCCESS);

    cufftComplex *pSrc = NULL, *pDest = NULL, *h = NULL;
    err = cudaMalloc(&pSrc, nBytes);
    assert(err == cudaSuccess);
    err = cudaMalloc(&pDest, nBytes);
    assert(err == cudaSuccess);

    // Host staging buffer: every input sample is 1 + 0i.
    h = new cufftComplex[nElem];
    for (int i = 0; i < nElem; i++) {
        h[i].x = 1;
        h[i].y = 0;
    }
    err = cudaMemcpy(pSrc, h, nBytes, cudaMemcpyHostToDevice);
    assert(err == cudaSuccess);

    res = cufftExecC2C(plan, pSrc, pDest, CUFFT_FORWARD);
    assert(res == CUFFT_SUCCESS);

    // Reuse the host buffer for the transformed data.
    err = cudaMemcpy(h, pDest, nBytes, cudaMemcpyDeviceToHost);
    assert(err == cudaSuccess);

    // Print as N_VEC rows of VEC_LEN "re,im" pairs.
    for (int i = 0; i < N_VEC; i++) {
        for (int j = 0; j < VEC_LEN; j++) printf("%f,%f ", h[i*VEC_LEN+j].x, h[i*VEC_LEN+j].y);
        printf("\n");
    }

    // Release everything acquired above: plan, device buffers, host buffer.
    res = cufftDestroy(plan);
    assert(res == CUFFT_SUCCESS);
    err = cudaFree(pSrc);
    assert(err == cudaSuccess);
    err = cudaFree(pDest);
    assert(err == cudaSuccess);
    delete[] h;
    return 0;
}
$ nvcc -o t13 t13.cu -lcufft
$ cuda-memcheck ./t13
========= CUDA-MEMCHECK
8.000000,0.000000 8.000000,0.000000 8.000000,0.000000 8.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
0.000000,0.000000 0.000000,0.000000 0.000000,0.000000 0.000000,0.000000
========= ERROR SUMMARY: 0 errors
$
The choice of pointer (nCol) passed for inembed and onembed does not matter in the 1D case (read the advanced data layout section of the documentation, linked in comment 2), but it must still be non-NULL if you expect the istride, idist, ostride, and odist parameters to be taken into account.