Hi,

```
Thank you for your guidance and instructions. Here are the complete codes:
```

#include “cuda_runtime.h”

#include “device_launch_parameters.h”

#include “cufft.h”

#include <stdio.h>

#define xsize 101

#define ysize 50

#define size xsize*ysize

#define thrds 1024

#define blocks 5

#define grid 1

typedef float2 Complex;

**global** void initzero(cufftComplex *X) {

```
int idx = blockIdx.x * blockDim.x + threadIdx.x;
while (idx < size) {
X[idx].x = 0.0f;
X[idx].y = 0.0f;
idx += blockDim.x*gridDim.x;
}
```

}

int main() {

```
Complex data[101][50];
cufftComplex *d_fft;
cudaMalloc((void **)&d_fft, sizeof(cufftComplex)* size);
initzero <<<blocks, thrds >>> (d_fft);
cudaMemcpy(data, d_fft, size, cudaMemcpyDeviceToHost);
// check initialized data
for (int i = 0; i < xsize; i++) {
printf("i=%d \n", i);
for (int j = 0; j < ysize; j++) {
printf("j=%d %e ", j, data[i][j].x);
}
printf("\n");
}
getchar();
return 0;
```

}

```
I am trying to use 1D d_fft in device and copy initialized values in d_fft to 2D data in host. Both d_fft and data have the same size of elements.
The results show that the first 650 (13 rows and 50 columns) plusing first 32 columns in 14 (13th) row, total 682, are initialized with "0", while rests are all given with same address.
I think I might use wrong thread number and block number to implement kernel function.
Thank you very much!
```

Dawn