Half2 cufft performance

I benchmarked half-precision (FP16) cuFFT against single-precision (float) cuFFT on a V100 under Linux, but the throughput of the FP16 cuFFT did not show much improvement over the float version. The code I used is below.

// Prints a diagnostic and trips an assertion when a cuFFT call did not succeed.
//   status: return value of the cuFFT call being checked.
//   what:   short label of the call, used only in the diagnostic message.
static void check_cufft_status(cufftResult_t status, const char* what)
{
	if (status != CUFFT_SUCCESS)
	{
		printf("cuFFT failure in %s (status %d)\n", what, (int)status);
		assert(false);
	}
}

// Writes `count` interleaved complex<float> samples to the binary file `path`.
// An unopenable file or a short write is reported on stdout; the function
// then returns without aborting so the rest of the demo can still run.
static void write_complex_samples(const char* path, const float* samples, size_t count)
{
	FILE* fp = fopen(path, "wb");
	if (fp == NULL)
	{
		printf("failed to open %s for writing\n", path);
		return;
	}
	if (fwrite(samples, sizeof(cufftComplex), count, fp) != count)
	{
		printf("short write to %s\n", path);
	}
	fclose(fp);
}

// Benchmarks a single 16384-point forward C2C FFT in single precision and in
// half precision, running each `loop` times between CUDA_TIMER_BEGIN and
// CUDA_TIMER_END.
//
// Files written to the current directory (all interleaved complex<float>):
//   signal_data.dat      - generated input signal
//   fft_out_data.dat     - single-precision FFT result
//   fp16fft_out_data.dat - half-precision FFT result, converted back to float
//
// Relies on CUDA_CALL, CUDA_TIMER_BEGIN/CUDA_TIMER_END and the kernels
// get_fft_sig, float22Half2Vec and half22Float2Vec, all defined elsewhere.
// Blocks on getchar() before returning.
//
// NOTE(review): with batch = 1, each timed iteration is one small transform,
// so the measurement is probably dominated by launch overhead rather than
// arithmetic throughput. Consider a much larger batch (with buffers sized to
// match) before comparing FP16 against FP32 - confirm with a profiler.
void half_precision_fft_demo()
{
	const int fft_size = 16384;   // must stay a power of two for the FP16 plan
	const int block_size = 1024;
	const int grid_size = (fft_size + block_size - 1) / block_size;
	const int loop = 1000;        // timed iterations per precision

	cuComplex* dev_complex = NULL;
	cuComplex* dev_complex_o = NULL;
	half2*     dev_h_complex = NULL;
	half2*     dev_h_complex_o = NULL;
	CUDA_CALL(cudaMalloc((void**)&dev_complex, fft_size * sizeof(cuComplex)));
	CUDA_CALL(cudaMalloc((void**)&dev_complex_o, fft_size * sizeof(cuComplex)));
	CUDA_CALL(cudaMalloc((void**)&dev_h_complex, fft_size * sizeof(half2)));
	CUDA_CALL(cudaMalloc((void**)&dev_h_complex_o, fft_size * sizeof(half2)));

	// Host staging buffer: fft_size interleaved (re, im) float pairs.
	float* host_complex = (float*)malloc(fft_size * sizeof(float) * 2);
	if (host_complex == NULL)
	{
		printf("failed to allocate host buffer\n");
		CUDA_CALL(cudaFree(dev_complex));
		CUDA_CALL(cudaFree(dev_h_complex));
		CUDA_CALL(cudaFree(dev_complex_o));
		CUDA_CALL(cudaFree(dev_h_complex_o));
		return;
	}

	// Generate the input signal on the GPU.
	get_fft_sig<<<grid_size, block_size>>>(dev_complex);
	CUDA_CALL(cudaGetLastError());   // surfaces launch-configuration errors
	CUDA_CALL(cudaMemcpy(host_complex, dev_complex, fft_size * sizeof(cuComplex), cudaMemcpyDeviceToHost));

	// Save the generated input signal.
	write_complex_samples("signal_data.dat", host_complex, (size_t)fft_size);

	/**************************************************************/
	// Plan geometry shared by the XT float plan and the half plan.
	// Declared unconditionally because the half-precision plan always goes
	// through cufftXtMakePlanMany, whichever float path is compiled in.
	int rankxt = 1;                          // 1D FFTs
	long long nxt[] = { fft_size };          // points per dimension
	long long istridext = 1;                 // distance between successive input elements
	long long ostridext = 1;                 // distance between successive output elements
	long long idistxt = fft_size;            // distance between input batches
	long long odistxt = fft_size;            // distance between output batches
	long long inembedxt[] = { fft_size };    // storage dimensions of the input
	long long onembedxt[] = { fft_size };    // storage dimensions of the output
	long long batchxt = 1;                   // number of transforms per execution
	size_t worksizext[1] = { 0 };            // filled in by cufftXtMakePlanMany

	/**************************************************************/
	// Single-precision FFT

	cufftHandle handle;
	cufftResult_t cufft_status;

#ifndef CUFFTTX

	// --- Batched 1D FFTs through the classic planning API.
	// cufftPlanMany allocates the handle itself, so no cufftCreate here
	// (creating one first would leak it).
	int rank = 1;                    // 1D FFTs
	int n[] = { fft_size };          // points per dimension
	int istride = 1;                 // distance between successive input elements
	int ostride = 1;                 // distance between successive output elements
	int idist = fft_size;            // distance between input batches
	int odist = fft_size;            // distance between output batches
	int inembed[] = { fft_size };    // storage dimensions of the input
	int onembed[] = { fft_size };    // storage dimensions of the output
	int batch = 1;                   // number of transforms per execution

	cufft_status = cufftPlanMany(&handle, rank, n,
		inembed, istride, idist,
		onembed, ostride, odist, CUFFT_C2C, batch);
	check_cufft_status(cufft_status, "cufftPlanMany (fp32)");

#else

	check_cufft_status(cufftCreate(&handle), "cufftCreate (fp32)");
	cufft_status = cufftXtMakePlanMany(handle, rankxt, nxt,
		inembedxt, istridext, idistxt, CUDA_C_32F,
		onembedxt, ostridext, odistxt, CUDA_C_32F,
		batchxt, worksizext, CUDA_C_32F);
	check_cufft_status(cufft_status, "cufftXtMakePlanMany (fp32)");

#endif // !CUFFTTX

	// Run the transform repeatedly and time the whole loop.
	cufftResult_t exec_status = CUFFT_SUCCESS;
	CUDA_TIMER_BEGIN;
	for (int i = 0; i < loop; i++)
	{
		exec_status = cufftXtExec(handle, dev_complex, dev_complex_o, CUFFT_FORWARD);
		if (exec_status != CUFFT_SUCCESS)
		{
			break;   // report after the timer so the check stays out of the timing
		}
	}
	CUDA_TIMER_END;
	check_cufft_status(exec_status, "cufftXtExec (fp32)");

	// --- Device->Host copy of the results (blocking, so it also waits for the FFTs).
	CUDA_CALL(cudaMemcpy(host_complex, dev_complex_o, fft_size * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

	// Save the single-precision result.
	write_complex_samples("fft_out_data.dat", host_complex, (size_t)fft_size);
	check_cufft_status(cufftDestroy(handle), "cufftDestroy (fp32)");

	/**************************************************************/
	// Half-precision FFT

	// Convert the input signal to half2.
	float22Half2Vec<<<grid_size, block_size>>>(dev_complex, dev_h_complex, fft_size);
	CUDA_CALL(cudaGetLastError());

	cufftHandle handlehf;
	check_cufft_status(cufftCreate(&handlehf), "cufftCreate (fp16)");
	cufft_status = cufftXtMakePlanMany(handlehf, rankxt, nxt,
		inembedxt, istridext, idistxt, CUDA_C_16F,
		onembedxt, ostridext, odistxt, CUDA_C_16F,
		batchxt, worksizext, CUDA_C_16F);
	check_cufft_status(cufft_status, "cufftXtMakePlanMany (fp16)");

	// Run the transform repeatedly and time the whole loop.
	exec_status = CUFFT_SUCCESS;
	CUDA_TIMER_BEGIN;
	for (int i = 0; i < loop; i++)
	{
		exec_status = cufftXtExec(handlehf, dev_h_complex, dev_h_complex_o, CUFFT_FORWARD);
		if (exec_status != CUFFT_SUCCESS)
		{
			break;
		}
	}
	CUDA_TIMER_END;
	check_cufft_status(exec_status, "cufftXtExec (fp16)");

	// Convert the half2 result back to float2 so it can be saved in the same
	// format as the single-precision output.
	half22Float2Vec<<<grid_size, block_size>>>(dev_h_complex_o, dev_complex_o, fft_size);
	CUDA_CALL(cudaGetLastError());

	// --- Device->Host copy of the results
	CUDA_CALL(cudaMemcpy(host_complex, dev_complex_o, fft_size * sizeof(float2), cudaMemcpyDeviceToHost));

	write_complex_samples("fp16fft_out_data.dat", host_complex, (size_t)fft_size);

	check_cufft_status(cufftDestroy(handlehf), "cufftDestroy (fp16)");

	// Release host and device buffers.
	free(host_complex);
	CUDA_CALL(cudaFree(dev_complex));
	CUDA_CALL(cudaFree(dev_h_complex));
	CUDA_CALL(cudaFree(dev_complex_o));
	CUDA_CALL(cudaFree(dev_h_complex_o));
	getchar();   // keep the console open until the user presses a key

}