/** * Compile with: * nvcc fft32_vs_16.cpp -L/usr/local/cuda-11.8/lib64 -lcudart -lcufft -o fft_benchmark */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define H2D cudaMemcpyHostToDevice #define D2H cudaMemcpyDeviceToHost extern "C" { inline void gpuAssert(cudaError_t code, const char *file, int32_t line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } #define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__);} #define CUDA_MALLOC(dptr, N, dtype) ERROR_CHECK(cudaMalloc((void**)&dptr, N*sizeof(dtype))) #define CUDA_MALLOC_HOST(dptr, N, dtype) ERROR_CHECK(cudaMallocHost((void**)&dptr, N*sizeof(dtype))) #define CUDA_UPLOAD(dptr, hptr, N, dtype) ERROR_CHECK(cudaMemcpy(dptr, hptr, N*sizeof(dtype), H2D)) #define CUDA_DOWNLOAD(hptr, dptr, N, dtype) ERROR_CHECK(cudaMemcpy(hptr, dptr, N*sizeof(dtype), D2H)) } // extern "C" // FFT shape const int data_shape[2] = {2048, 1024}; void benchmark_fp32() { printf("============ float32 fft test ===============\n"); float2 *h_inp, *h_op; float2 *d_ip, *d_op; h_inp = (float2*) malloc(data_shape[0] * data_shape[1] * sizeof(float2)); h_op = (float2*) malloc(data_shape[0] * data_shape[1] * sizeof(float2)); CUDA_MALLOC(d_ip, data_shape[0] * data_shape[1], float2); CUDA_MALLOC(d_op, data_shape[0] * data_shape[1], float2); size_t n_elem = data_shape[0] * data_shape[1]; for(size_t i = 0; i < n_elem; i++) { h_inp[i].x = rand() / RAND_MAX; h_inp[i].y = rand() / RAND_MAX; } CUDA_UPLOAD(d_ip, h_inp, n_elem, float2); cufftResult ret; cufftHandle fft_h; int rank = 2; int n[2] = {data_shape[0], data_shape[1]}; int inembed[2] = {data_shape[0], data_shape[1]}; int istride = 1; int idist = 1; int onembed[2] = {data_shape[0], data_shape[1]}; int ostride = 1; int odist = 1; cudaEvent_t start, stop; float elapsed; cudaEventCreate(&start); cudaEventCreate(&stop); ret = cufftPlanMany(&fft_h, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2C, 1); if(ret == CUFFT_SUCCESS) { // warm up. ret = cufftExecC2C(fft_h, d_ip, d_op, CUFFT_FORWARD); for(int itrial = 0; itrial < 10; itrial++) { cudaEventRecord(start); ret = cufftExecC2C(fft_h, d_ip, d_op, CUFFT_FORWARD); cudaEventRecord(stop); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed, start, stop); if(ret == CUFFT_SUCCESS) { printf("trial #%d, elapsed: %3.4f ms\n", itrial, elapsed); } else { printf("cufftExec failed with code: %d\n", ret); } } } else { printf("cufftPlanMany failed with code: %d\n", ret); } cudaFree(d_ip); cudaFree(d_op); free(h_inp); free(h_op); } void benchmark_fp16() { printf("============ float16 fft test ===============\n"); half2 *h_inp, *h_op; half2 *d_ip, *d_op; h_inp = (half2*) malloc(data_shape[0] * data_shape[1] * sizeof(half2)); h_op = (half2*) malloc(data_shape[0] * data_shape[1] * sizeof(half2)); CUDA_MALLOC(d_ip, data_shape[0] * data_shape[1], half2); CUDA_MALLOC(d_op, data_shape[0] * data_shape[1], half2); size_t n_elem = data_shape[0] * data_shape[1]; for(size_t i = 0; i < n_elem; i++) { h_inp[i].x = __double2half(rand() / RAND_MAX); h_inp[i].y = __double2half(rand() / RAND_MAX); } CUDA_UPLOAD(d_ip, h_inp, n_elem, half2); cufftResult ret; cufftHandle fft_h; int rank = 2; long long int n[2] = {data_shape[0], data_shape[1]}; long long int inembed[2] = {data_shape[0], data_shape[1]}; long long int istride = 1; long long int idist = 1; long long int onembed[2] = {data_shape[0], data_shape[1]}; long long int ostride = 1; long long int odist = 1; long long int batch = 1; cudaEvent_t start, stop; float elapsed; cudaEventCreate(&start); cudaEventCreate(&stop); size_t workSize = 1000000; ret = cufftCreate(&fft_h); ret = cufftXtMakePlanMany(fft_h, rank, n, inembed, istride, idist, CUDA_C_16F, onembed, ostride, odist, CUDA_C_16F, batch, &workSize, CUDA_C_16F); printf("workSize: %d\n", workSize); if(ret == CUFFT_SUCCESS) { // warm up. ret = cufftXtExec(fft_h, d_ip, d_op, CUFFT_FORWARD); for(int itrial = 0; itrial < 10; itrial++) { cudaEventRecord(start); ret = cufftXtExec(fft_h, d_ip, d_op, CUFFT_FORWARD); cudaEventRecord(stop); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed, start, stop); if(ret == CUFFT_SUCCESS) { printf("trial #%d, elapsed: %3.4f ms\n", itrial, elapsed); } else { printf("cufftExec() failed with code: %d\n", ret); } } } else { printf("cufftXtMakePlanMany() failed with code: %d\n", ret); } cudaFree(d_ip); cudaFree(d_op); free(h_inp); free(h_op); } int main(int argc, char** argv) { printf("FFT shape: (%d, %d)\n", data_shape[0], data_shape[1]); benchmark_fp32(); benchmark_fp16(); return 0; }