Hi,
thank you for your reply.
I use an NVIDIA Jetson AGX Xavier with 32 GB LPDDR4
- Jetpack 4.4 DP [L4T 32.4.2]
- Linux jetson-01 4.9.140-tegra #1 SMP PREEMPT Wed Apr 8 18:15:20 PDT 2020 aarch64 aarch64 aarch64 GNU/Linux
- NV Power Mode: MAXN - Type: 0
- Board:
- Board(s): P2888-0001, P2822-0000
- Code Name: galen
- GPU-Arch: 7.2
- SN: 1420820014613
- SOC: tegra194 - ID:25
- Type: AGX Xavier
theoretical performance
The webpage https://devblogs.nvidia.com/nvidia-jetson-agx-xavier-32-teraops-ai-robotics/?ncid=so-fac-mdjngxxrmllhml-69163 asserts that the “Jetson AGX Xavier integrated Volta GPU […] provides 512 CUDA cores and 64 Tensor Cores for up to 11 TFLOPS FP16”. There is no FP32 number in that source.
For consumer GPUs like the RTX 2080 Ti I get theoretical performance numbers from e.g. NVIDIA GeForce RTX 2080 Ti Specs | TechPowerUp GPU Database (I have not found an NVIDIA link with FP16 performance numbers). The performance is given as 13.45 TFLOPS for FP32 and 26.90 TFLOPS for FP16.
When I multiply the boost clock (1.545 GHz) by the number of CUDA cores (4352) and by two (one MAC counts as two floating-point operations), I get the theoretical FP32 performance: 13.45 TFLOPS.
I did the same calculation for the Jetson, 1.4 GHz x 512 CUDA cores x 2, and got 1.4 TFLOPS (FP32). However, this number is far lower than expected: 11 TFLOPS (FP16) vs. 1.4 TFLOPS (FP32) is a factor of 7.8 instead of the expected 2 between half and float. So I assume that my calculation for FP32 is not correct.
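For reference, here is the estimate as a tiny standalone program (a sketch: the formula clock x cores x 2 ops per FMA is my assumption, with the clock values taken from the spec pages cited above):
// peak_flops.cpp -- back-of-the-envelope peak FP32 estimate: clock * CUDA cores * 2 (one FMA = 2 ops)
#include <cstdio>

int main() {
    const double rtx_2080_ti = 1.545e9 * 4352 * 2.0; // boost clock * cores * 2
    const double agx_xavier  = 1.400e9 *  512 * 2.0;
    std::printf("RTX 2080 Ti FP32: %5.2f TFLOPS\n", rtx_2080_ti / 1e12); // ~13.45
    std::printf("AGX Xavier  FP32: %5.2f TFLOPS\n", agx_xavier  / 1e12); // ~1.43
    return 0;
}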
benchmark
My source code to compare FP16 and FP32 FFTs is attached at the end.
The following numbers show the total time and, in parentheses, the time per 2D FFT.
float2 (FP32)
$ ./fft_benchmark 0
============ Results ============
0.45727s (0.0017862s)
0.45041s (0.0017594s)
0.45718s (0.0017859s)
0.45253s (0.0017677s)
and half2 (FP16)
$ ./fft_benchmark 1
============ Results ============
0.85723s (0.0033486s)
0.85767s (0.0033503s)
0.85755s (0.0033498s)
0.85751s (0.0033497s)
The transforms with FP32 are faster than those with FP16 by a factor of about two. It should be the other way around.
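One thing I have not verified yet is the timing method itself (see the TODO in the attached source): std::chrono around cufftXtExec plus cudaDeviceSynchronize also measures launch and API overhead. A variant of the inner loop using CUDA events would look roughly like this (a sketch reusing plan, d_data, d_data_transformed and iteration_count from perform() below; not part of the attached code):
// sketch: time iteration_count FFTs with CUDA events instead of std::chrono
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);

cudaEventRecord(ev_start);
for (int i = 0; i < iteration_count; ++i){
    cufftXtExec(plan, d_data, d_data_transformed, CUFFT_FORWARD);
}
cudaEventRecord(ev_stop);
cudaEventSynchronize(ev_stop);

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop); // milliseconds on the GPU timeline
cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);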
Best regards,
Kacie
Source Code:
fft_benchmark.cu
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <type_traits>
#include <vector>

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cufft.h>
#include <cufftXt.h>
unsigned int seed = 82586461;

// random complex value: real part in [-2, 2) in steps of 1/16, imaginary part zero
half2 gen_rand_half(){
    return half2 {
        half(static_cast<float>(rand_r(&seed) % 64 - 32) / 16),
        half(0.0f)
    };
}

float2 gen_rand_float(){
    return float2 {
        static_cast<float>(rand_r(&seed) % 64 - 32) / 16,
        0.0f
    };
}
// Runs `repetitions` timed rounds of `iteration_count` batched 2D FFTs
// (batch_size transforms of width x height each) in precision T and prints the timings.
template <typename T>
void perform(const std::vector<T>& h_data, int width, int height, int batch_size, int iteration_count, int repetitions)
{
    auto frame_size = width * height;

    // allocate device buffers; the element type is T (half2 or float2), not always cufftComplex
    T* d_data;
    T* d_data_transformed;
    size_t d_data_bytes = sizeof(T) * frame_size * batch_size;
    cudaMalloc((void **)&d_data, d_data_bytes);
    cudaMalloc((void **)&d_data_transformed, d_data_bytes);
    cudaMemcpy(d_data, h_data.data(), d_data_bytes, cudaMemcpyHostToDevice);

    std::vector<double> timings(repetitions, -1.0);

    // plan parameters for a batched, contiguous 2D transform
    const int rank = 2;
    long long n[rank] = {height, width};
    long long idist = height * width;
    long long odist = height * width;
    long long inembed[] = {height, width};
    long long onembed[] = {height, width};
    long long istride = 1;
    long long ostride = 1;
    size_t workSize = 0;

    cufftHandle plan;
    cufftCreate(&plan);
    // half2 and float2 use the same call; only the cuFFT data type differs
    cudaDataType data_type = std::is_same<T, half2>::value ? CUDA_C_16F : CUDA_C_32F;
    cufftXtMakePlanMany(plan, rank, n,
                        inembed, istride, idist, data_type,
                        onembed, ostride, odist, data_type,
                        batch_size, &workSize, data_type);
    // warmup (the first execution may include lazy initialization)
    cufftXtExec(plan, d_data, d_data_transformed, CUFFT_FORWARD);
    cudaDeviceSynchronize();

    for(int r = 0; r < repetitions; ++r){
        // perform benchmark
        auto start = std::chrono::steady_clock::now();
        for (int i = 0; i < iteration_count; ++i){
            cufftXtExec(plan, d_data, d_data_transformed, CUFFT_FORWARD);
            cudaDeviceSynchronize();
        }
        // TODO use cuda events to time kernel execution?
        auto end = std::chrono::steady_clock::now();

        // elapsed wall-clock time in seconds
        double time_taken = static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()) / 1000 / 1000;
        timings.at(r) = time_taken;
    }
    std::cout << "============ Results ============" << std::endl;
    for(auto e : timings){
        std::cout << std::setw(10) << std::setprecision(5) << e;
        std::cout << "s (" << std::setprecision(5);
        std::cout << e / (batch_size * iteration_count) << "s)" << std::endl;
    }
    cufftDestroy(plan);
    cudaFree(d_data);
    cudaFree(d_data_transformed);
}
// dispatches on the requested precision: 0 -> float2 (FP32), 1 -> half2 (FP16)
void perform_(int type, int width, int height, int batch_size, int iteration_count, int repetitions){
    if(type == 0){
        std::vector<float2> h_data(width * height * batch_size);
        std::generate(h_data.begin(), h_data.end(), gen_rand_float);
        perform<float2>(h_data, width, height, batch_size, iteration_count, repetitions);
    } else if(type == 1){
        std::vector<half2> h_data(width * height * batch_size);
        std::generate(h_data.begin(), h_data.end(), gen_rand_half);
        perform<half2>(h_data, width, height, batch_size, iteration_count, repetitions);
    }
}
main.cpp:
#include <cstdlib>
#include <iostream>

extern void perform_(int type, int width, int height, int batch_size, int iteration_count, int repetitions);

int main(int argc, char **argv)
{
    // 0 = float2 (FP32), 1 = half2 (FP16)
    int type = 0;
    if(argc > 1){
        type = strtol(argv[1], NULL, 10);
    }
    int width = 2048;
    int height = 2048;
    int batch_size = 16;
    int iteration_count = 16;
    int repetitions = 4;
    perform_(type, width, height, batch_size, iteration_count, repetitions);
    return 0;
}
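For completeness, a build line along these lines should work (a sketch, not my exact command; sm_72 is the compute capability of the Xavier's Volta GPU):
nvcc -std=c++11 -arch=sm_72 fft_benchmark.cu main.cpp -o fft_benchmark -lcufft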