Hello.
My project has a lot of Fourier transforms, mostly one-dimensional transformations of matrix rows and columns.
Matrix dimensions = 8192x8192 cuComplex.
The FFT by rows is pretty fast: ~6 ms.
But for the transform by columns the time is abnormally long, ~1.5 seconds, and I suspect that I am doing something wrong.
I'll attach a small test that shows how I perform the Fourier transforms.
I use the AGX Orin 32GB H01 dev kit.
Before running the test I run sudo jetson_clocks and set the power mode to MAXN.
// nvcc -o fft_test.o fft_test.cu -lcufft
#include <math.h>
#include <stdio.h>
#include <cuComplex.h>
#include <cufft.h>
#define BLOCK_DIM 16
/*
 * Fill a row-major complex matrix with the constant (value + 0i).
 *
 * Layout contract (as used by the indexing below):
 *   - `row` is the number of elements in one matrix row (the contiguous,
 *     fastest-varying dimension); the x thread index walks along it.
 *   - `col` is the number of elements in one matrix column, i.e. the number
 *     of rows; the y thread index walks along it.
 *   - element (y, x) lives at data[y * row + x].
 *
 * Launch: 2D grid of 2D blocks; one thread per element. The grid may be
 * larger than the matrix - out-of-range threads do nothing.
 */
__global__ void data_set(cuComplex *data, float value, int col, int row)
{
    const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch grid is rounded up to whole blocks.
    if (xIndex >= row || yIndex >= col) {
        return;
    }

    // Do the flat-index arithmetic in size_t: yIndex * row overflows a
    // 32-bit int as soon as the matrix holds 2^31 or more elements.
    const size_t matIndex = (size_t)yIndex * (size_t)row + (size_t)xIndex;

    data[matIndex].x = value;
    data[matIndex].y = 0.0f;
}
/*
 * Status-check helpers. Both print the failing location and make the
 * ENCLOSING FUNCTION return 1, so they may only be used inside functions
 * that return int. Resources are not released on these error paths; the
 * process is expected to exit right away.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_status_ = (call);                                 \
        if (cuda_status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_status_));           \
            return 1;                                                      \
        }                                                                  \
    } while (0)

#define CUFFT_CHECK(call)                                                  \
    do {                                                                   \
        cufftResult cufft_status_ = (call);                                \
        if (cufft_status_ != CUFFT_SUCCESS) {                              \
            fprintf(stderr, "cuFFT error at %s:%d: code %d\n", __FILE__,   \
                    __LINE__, (int)cufft_status_);                         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/*
 * Run one in-place forward C2C transform with `plan` and report its GPU
 * time in milliseconds through `elapsed_ms`.
 *
 * An untimed warm-up execution is performed first so that one-time costs of
 * the first call are not attributed to the transform itself.
 * NOTE(review): whether first-call overhead contributes to the slow column
 * figure reported for this benchmark has to be confirmed by measurement.
 *
 * The transform overwrites `data`; the values are irrelevant for timing.
 * Returns 0 on success, 1 if any CUDA/cuFFT call failed.
 */
static int time_forward_fft(cufftHandle plan, cuComplex *data,
                            cudaEvent_t start, cudaEvent_t stop,
                            float *elapsed_ms)
{
    // Warm-up pass (not timed).
    CUFFT_CHECK(cufftExecC2C(plan, data, data, CUFFT_FORWARD));
    CUDA_CHECK(cudaDeviceSynchronize());

    // Timed pass: the events bracket the FFT work in the same stream, so no
    // host-side synchronize is needed between the exec call and the stop
    // event.
    CUDA_CHECK(cudaEventRecord(start));
    CUFFT_CHECK(cufftExecC2C(plan, data, data, CUFFT_FORWARD));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(elapsed_ms, start, stop));
    return 0;
}

/*
 * Benchmark batched 1D FFTs over the rows and over the columns of a
 * row-major complex matrix stored in a single device buffer.
 *
 * Naming (consistent with the data_set kernel's indexing):
 *   n_row = number of elements in one row    (contiguous dimension)
 *   n_col = number of elements in one column (= number of rows)
 */
int main()
{
    const int n_col = 8192;
    const int n_row = 8192;

    float milliseconds = 0;
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Widen before multiplying so the byte count cannot overflow int.
    cuComplex *gpu_data = NULL;
    const size_t n_bytes = (size_t)n_col * (size_t)n_row * sizeof(cuComplex);
    CUDA_CHECK(cudaMalloc((void **)&gpu_data, n_bytes));

    // Round the grid up so sizes that are not multiples of BLOCK_DIM are
    // still fully covered (the kernel bounds-checks the tail).
    const dim3 blockSize(BLOCK_DIM, BLOCK_DIM, 1);
    const dim3 gridSize((n_row + BLOCK_DIM - 1) / BLOCK_DIM,
                        (n_col + BLOCK_DIM - 1) / BLOCK_DIM, 1);
    data_set<<<gridSize, blockSize>>>(gpu_data, 1.0f, n_col, n_row);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while running

    // Row plan: each transform covers one row of n_row contiguous elements;
    // consecutive rows start n_row elements apart; one transform per row,
    // and there are n_col rows.
    cufftHandle row_plan;
    int row_n[] = {n_row};
    int row_inembed[1] = {n_row};
    int row_onembed[1] = {n_row};
    const int row_stride = 1;
    const int row_dist = n_row;
    const int row_batch = n_col;
    CUFFT_CHECK(cufftPlanMany(&row_plan, 1, row_n,
                              row_inembed, row_stride, row_dist,
                              row_onembed, row_stride, row_dist,
                              CUFFT_C2C, row_batch));

    // Column plan: each transform covers one column of n_col elements that
    // are n_row apart in memory; consecutive columns start 1 element apart;
    // one transform per column, and there are n_row columns.
    // NOTE(review): this strided layout is far less memory-friendly than
    // the contiguous row case. If the column time stays high after the
    // warm-up, transposing and reusing the row plan is a common
    // alternative - measure before committing to either.
    cufftHandle col_plan;
    int col_n[] = {n_col};
    int col_inembed[1] = {n_col};
    int col_onembed[1] = {n_col};
    const int col_stride = n_row;
    const int col_dist = 1;
    const int col_batch = n_row;
    CUFFT_CHECK(cufftPlanMany(&col_plan, 1, col_n,
                              col_inembed, col_stride, col_dist,
                              col_onembed, col_stride, col_dist,
                              CUFFT_C2C, col_batch));

    // row fft (author's original single-shot measurement: ~6 ms)
    if (time_forward_fft(row_plan, gpu_data, start, stop, &milliseconds) != 0) {
        return 1;
    }
    printf("row fft execotion time : %f ms\n", milliseconds);

    // column fft (author's original single-shot measurement: ~1.5 s)
    if (time_forward_fft(col_plan, gpu_data, start, stop, &milliseconds) != 0) {
        return 1;
    }
    printf("column fft execotion time : %f ms\n", milliseconds);

    // Release everything that was created above.
    CUFFT_CHECK(cufftDestroy(row_plan));
    CUFFT_CHECK(cufftDestroy(col_plan));
    CUDA_CHECK(cudaFree(gpu_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}
nvidia@nvidia-desktop:~$ ./fft_test.o
row fft execotion time : 5.611392 ms
column fft execotion time : 1625.680298 ms