I’m developing on an NVIDIA Xavier. I plan to run FFTs with cuFFT from CUDA, profile the program with nvprof, and check the performance in NVIDIA Visual Profiler.
Below I have pasted part of the source code and the result of profiling it with nvprof.
Please take a look.
The 256x64 input data is read from an Excel file (that code is omitted), and the transform is computed by cuFFT.
I have a question about the nvprof result, which lists the following two kernels:
void regular_fft<unsigned int=64, unsigned int=8, unsigned int=32, padding_t=1, twiddle_t=0, loadstore_modifier_t=2, layout_t=1, unsigned int, float>(kernel_arguments_t)
and
void vector_fft<unsigned int=256, unsigned int=16, unsigned int=1, padding_t=6, twiddle_t=0, loadstore_modifier_t=2, layout_t=0, unsigned int, float>(kernel_arguments_t)
What do the template parameters shown above mean?
For example, in padding_t = 6 and twiddle_t = 0,
what do those values signify?
Regards,
Masato
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
//#include <helper_cuda.h>
//#include <helper_functions.h>
/* Size of one 2-D transform: LEN_X is the second (last) entry of the
 * dimension array passed to cufftPlanMany, LEN_Y the first. */
#define LEN_X 256
#define LEN_Y 64
/* Number of LEN_Y x LEN_X transforms executed by a single plan. */
#define BATCH 192
/* Real-valued staging buffer with one float per (y, x) sample.
 * NOTE(review): the code that fills it is not part of this listing —
 * presumably the CSV/Excel loader writes here; confirm against the full file. */
static float csv_buf[LEN_Y][LEN_X];
/* Abort with a diagnostic when a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_err_ = (call);                                     \
        if (cuda_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_err_));               \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Abort with a diagnostic when a cuFFT call does not return CUFFT_SUCCESS.
 * The numeric cufftResult code is printed; look it up in cufft.h. */
#define CUFFT_CHECK(call)                                                   \
    do {                                                                    \
        cufftResult fft_err_ = (call);                                      \
        if (fft_err_ != CUFFT_SUCCESS) {                                    \
            fprintf(stderr, "cuFFT error at %s:%d: code %d\n", __FILE__,    \
                    __LINE__, (int)fft_err_);                               \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Runs BATCH forward complex-to-complex 2-D FFTs of size LEN_Y x LEN_X with
 * cuFFT, times the cufftExecC2C call with CUDA events, prints the elapsed
 * time and copies the spectra back to the host.
 *
 * Returns 0 on success; any failed allocation, CUDA call or cuFFT call
 * terminates the process with EXIT_FAILURE after printing a diagnostic.
 */
int main()
{
    /* Dimension array for cufftPlanMany: n[0] = LEN_Y, n[1] = LEN_X. */
    int n[2] = {LEN_Y, LEN_X};

    const size_t plane_elems = (size_t)LEN_X * LEN_Y;
    const size_t plane_bytes = sizeof(cufftComplex) * plane_elems;
    const size_t total_bytes = plane_bytes * BATCH;

    cufftComplex *h_in_data = NULL;        /* one input plane               */
    cufftComplex *h_in_data_batch = NULL;  /* the plane replicated BATCH x  */
    cufftComplex *h_out_data = NULL;       /* all BATCH output spectra      */
    cufftComplex *d_in_data = NULL;
    cufftComplex *d_out_data = NULL;

    /* Timing state. */
    float elapsed_time_ms = 0.0f;
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    h_in_data = (cufftComplex *)malloc(plane_bytes);
    h_in_data_batch = (cufftComplex *)malloc(total_bytes);
    h_out_data = (cufftComplex *)malloc(total_bytes);
    if (h_in_data == NULL || h_in_data_batch == NULL || h_out_data == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    /* Build the complex input plane from the real samples in csv_buf so the
     * transform never runs on uninitialised memory.
     * NOTE(review): the loader that fills csv_buf is omitted from this
     * listing; without it csv_buf is zero-initialised static storage. */
    for (int y = 0; y < LEN_Y; y++) {
        for (int x = 0; x < LEN_X; x++) {
            const int pos = y * LEN_X + x;
            h_in_data[pos].x = csv_buf[y][x];
            h_in_data[pos].y = 0.0f;
        }
    }

    CUDA_CHECK(cudaMalloc((void **)&d_in_data, total_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out_data, total_bytes));
    /* Both device buffers are completely overwritten below (by the HtoD copy
     * and by the FFT), so these clears are not required for correctness;
     * they are kept so the run matches the profile that was captured. */
    CUDA_CHECK(cudaMemset(d_in_data, 0, total_bytes));
    CUDA_CHECK(cudaMemset(d_out_data, 0, total_bytes));

    /* Replicate the single plane BATCH times. */
    for (int i = 0; i < BATCH; i++) {
        memcpy(&h_in_data_batch[(size_t)i * plane_elems], h_in_data, plane_bytes);
    }

    /* Copy host to device. */
    CUDA_CHECK(cudaMemcpy(d_in_data, h_in_data_batch, total_bytes,
                          cudaMemcpyHostToDevice));

    cufftHandle plan;
    /* NULL inembed/onembed selects the basic contiguous data layout. */
    CUFFT_CHECK(cufftPlanMany(&plan, 2, n, NULL, 1, 1, NULL, 1, 1,
                              CUFFT_C2C, BATCH));

    /* Time only the FFT execution. */
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUFFT_CHECK(cufftExecC2C(plan, d_in_data, d_out_data, CUFFT_FORWARD));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    /* Wait for the stop event so the elapsed time is valid. */
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_ms, start, stop));
    printf("cufftExecC2C (%d x %dx%d C2C): %.3f ms\n",
           BATCH, LEN_Y, LEN_X, elapsed_time_ms);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    /* Copy device to host. */
    CUDA_CHECK(cudaMemcpy(h_out_data, d_out_data, total_bytes,
                          cudaMemcpyDeviceToHost));

    CUFFT_CHECK(cufftDestroy(plan));
    CUDA_CHECK(cudaFree(d_in_data));
    CUDA_CHECK(cudaFree(d_out_data));

    free(h_out_data);
    free(h_in_data_batch);
    free(h_in_data);
    return 0;
}
==10779== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 54.85% 5.1925ms 1 5.1925ms 5.1925ms 5.1925ms [CUDA memcpy DtoH]
29.27% 2.7707ms 1 2.7707ms 2.7707ms 2.7707ms [CUDA memcpy HtoD]
6.84% 647.82us 2 323.91us 323.19us 324.63us [CUDA memset]
4.67% 442.34us 1 442.34us 442.34us 442.34us void regular_fft<unsigned int=64, unsigned int=8, unsigned int=32, padding_t=1, twiddle_t=0, loadstore_modifier_t=2, layout_t=1, unsigned int, float>(kernel_arguments_t)
4.37% 413.79us 1 413.79us 413.79us 413.79us void vector_fft<unsigned int=256, unsigned int=16, unsigned int=1, padding_t=6, twiddle_t=0, loadstore_modifier_t=2, layout_t=0, unsigned int, float>(kernel_arguments_t)