Jetson AGX Xavier FP64 performance


Hi,

I bought a Jetson AGX Xavier, installed Ubuntu 18.04 LTS, and tested FP32/FP64 performance with 1 giga (1e9) FP32/FP64 adds. The kernel function is below; there are 1000 kernel calls, and each kernel call does about 1 million FP32 or FP64 adds. There are 2 runs, one with float "vtype" and another with double "vtype"; I checked the PTX code and the only difference is add.f32 vs add.f64. For the performance, the 1 giga FP32 adds take about 3.16 seconds, and the 1 giga FP64 adds take about 31.55 seconds to finish. So the FP64 add is about 10x slower than the FP32 add, but according to the Jetson AGX Xavier spec the FP64:FP32 GFLOPS ratio is 1:2, so I expected only about a 2x slowdown.
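
For reference, the back-of-the-envelope rates behind that comparison (plain host C++, just restating the numbers above):

#include <cstdio>

int main()
{
    const double totalAdds = 1e9;   // 1000 kernel calls x ~1e6 adds per call
    const double secFp32 = 3.16;    // measured FP32 wall-clock time
    const double secFp64 = 31.55;   // measured FP64 wall-clock time
    std::printf("fp32: %.3f G adds/s\n", totalAdds / secFp32 / 1e9);  // ~0.317
    std::printf("fp64: %.3f G adds/s\n", totalAdds / secFp64 / 1e9);  // ~0.032
    std::printf("fp64/fp32 slowdown: %.1fx\n", secFp64 / secFp32);    // ~10.0
    return 0;
}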

So are there any options to enable fast FP64 performance?

Thanks
Henry

--------- kernel function
__global__ void
vectorAdd(vtype* const A) // --> vtype is float or double
{
const auto i = blockDim.x * blockIdx.x + threadIdx.x;
auto a = A[i];
for (auto j = numAddsPerThread / 10; j >0 ; --j) {
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
}
A[i] = a;
}
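
As a side note, the launches could also be timed directly with CUDA events rather than by wall-clock time. Below is an untested sketch, assuming the same vectorAdd kernel and vtype as above, plus the checkCudaErrors helper from the CUDA samples' helper_cuda.h:

// Sketch only: measures the GPU time of numLoops back-to-back vectorAdd launches
// with CUDA events, so only the GPU-side time is reported, not process startup.
static float timeLaunchesMs(vtype* const d_A, const unsigned numLoops, const unsigned numBlocks)
{
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start));
    for (unsigned i = 0; i < numLoops; ++i)
        vectorAdd<<<numBlocks, 1>>>(d_A);   // same 1-thread-per-block launch as in the test
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));

    float ms = 0.f;
    checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    return ms;
}

Calling this as timeLaunchesMs(d_A, 1000, 1) in place of the launch loop would report the kernel time for either the float or the double build.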

Hi,
This looks like a Jetson issue, so we are moving it to the Jetson AGX Xavier forum.

Hi,

We would like to reproduce this in our environment first.
Could you share the complete source with us?

Thanks.

Thanks for the quick response; here is the source. Line 47 is for FP64 (double); please switch to line 48, "typedef float vtype;", to test FP32 (float). Thanks, ~Henry

#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <assert.h> // for assert(); the header name was dropped by the forum formatting

class jetson
{
public:
class memManager
{
public:
//1.652u 1.396s
// zero-copy: pinned host memory mapped into the device address space
class zeroCopy
{
public:
static void* malloc(const unsigned size) {assert(size); void* p = 0; checkCudaErrors(cudaMallocHost(&p, size)); assert(p); return p;}
static void* getDevPtr(void* const p) {assert(p); void* p2 = 0; checkCudaErrors(cudaHostGetDevicePointer(&p2, p, 0)); assert(p2); return p2;}
static void free(void* const p) {assert(p); checkCudaErrors(cudaFreeHost(p));}
};

//2.324u 4.496s
// unified (managed) memory: the same pointer is valid on host and device
class unified
{
public:
static void* malloc(const unsigned size) {assert(size); void* p = 0; checkCudaErrors(cudaMallocManaged(&p, size)); assert(p); return p;}
static void* getDevPtr(void* const p) {assert(p); return p;}
static void free(void* const p) {assert(p); checkCudaErrors(cudaFree(p));}
};

};

static const unsigned numWarps = 16;
static const unsigned numThreadsPerWarp = 32;
static const unsigned numThreads = numWarps * numThreadsPerWarp;

static void sync() {cudaDeviceSynchronize();}
};

// to dump ptx, /usr/local/cuda/bin/cuobjdump gflops.o -ptx

//double: 0.044u 0.040s 0:31.55 0.2% 0+0k 0+0io 0pf+0w
//float: 0.032u 0.040s 0:03.16 2.2% 0+0k 0+0io 0pf+0w
//typedef double vtype;
typedef float vtype;

static const auto numAddsPerThread = 1000000;

__global__ void
vectorAdd(vtype* const A)
{
const auto i = blockDim.x * blockIdx.x + threadIdx.x;
auto a = A[i];
for (auto j = numAddsPerThread / 10; j >0 ; --j) {
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
a += 1.;
}
A[i] = a;
}

/**
 * Host main routine
 */
int
main(void)
{
typedef jetson::memManager::unified mm;

// Launch the Vector Add CUDA Kernel
const unsigned numLoops = 1000;
const auto numBlocks = 1;
const auto size = numBlocks * jetson::numThreadsPerWarp;

const auto numAdds = numAddsPerThread * numLoops;

printf("num calc per thread %1.2f G\n", numAdds / 1.e9);

//cudaSetDeviceFlags(cudaDeviceMapHost);

const auto h_A = (vtype*)mm::malloc(size * sizeof(vtype));
const auto d_A = (vtype*)mm::getDevPtr(h_A);
for (int i = 0; i < size; ++i) h_A[i] = 0.; // initialize only the size elements actually allocated

for (auto i = 0; i < numLoops; ++i) {
  //1000x  launch: 0.060u 0.020s
  //10000x launch: 2.180u 3.372s
  vectorAdd<<<numBlocks, 1>>>(d_A);
}
jetson::sync();

const auto err = cudaGetLastError();
if (err != cudaSuccess)
{
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

printf("CUDA kernel done\n");


// Verify that the result vector is correct
for (int i = 0; i < size; ++i) {
  if (h_A[i] != numAdds) {
        fprintf(stderr, "Result verification failed at element %d!\n", i);
        fprintf(stderr, "%1f != %f \n", h_A[i] / 1.e9, numAdds / 1.e9);
        exit(EXIT_FAILURE);
    }
}

printf("Test PASSED\n");

mm::free(h_A);

printf("Done\n");
return 0;

}
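
For completeness, a plausible way to build the test and inspect the generated code, in the spirit of the cuobjdump note in the source (illustrative file name gflops.cu; sm_72 is the Xavier iGPU's compute capability, and the helper_cuda.h include path is the CUDA samples' default, so adjust for your install):

// nvcc -O3 -arch=sm_72 -I/usr/local/cuda/samples/common/inc gflops.cu -o gflops
// /usr/local/cuda/bin/cuobjdump gflops -ptx    (PTX, as in the note above)
// /usr/local/cuda/bin/cuobjdump gflops -sass   (SASS, the machine code actually executed)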

Are there any unknown factors or difficulties in reproducing these FP32/FP64 performance numbers?

Hi AastaLLL, did anyone look into this? Were you able to reproduce it? Did you find any issue, or can you recommend any options to resolve it? Thanks, ~Henry