Conditions for selecting nvJet kernels on Jetson Thor

I’m using cuBLAS to run some simple GEMM benchmarks on Jetson Thor.

I found that with FP16 inputs + FP16 accumulation, cuBLAS would

  1. Select `cutlass3x_sm100_tensorop_h256x256x16gemm_f16_f16_f16_f16_f16_256x256x64_0_ttn_align8_2sm_bias_f16_relu_stream_k` kernel
  2. Achieve ~60 TFLOP/s

And with FP16 inputs + FP32 accumulation, cuBLAS would

  1. Select `nvjet_hsh_128x256_64x6_2x1_2cta_v_bz_TTT` kernel
  2. Achieve ~160 TFLOP/s

I searched around and found that the nvjet kernels seem to be tuned for the Jetson platform. I’m wondering why cuBLAS doesn’t use them when FP16 accumulation is requested.

Is nvJet only supported for mixed-precision inference? Does that suggest we should use mixed precision instead of pure FP16?
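
For context, by “mixed precision” I mean FP16 inputs/outputs with FP32 accumulation. A minimal sketch of such a call (the wrapper name is just for illustration):

// Sketch: FP16 in/out, FP32 accumulation. With CUBLAS_COMPUTE_32F,
// cuBLAS expects alpha/beta as float rather than __half.
#include <cublas_v2.h>
#include <cuda_fp16.h>

cublasStatus_t gemm_f16_acc32(cublasHandle_t handle, int m, int n, int k,
                              const __half* A, int lda,
                              const __half* B, int ldb,
                              __half* C, int ldc) {
  const float alpha = 1.0f, beta = 0.0f;  // FP32 scalars match the compute type
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                      &alpha, A, CUDA_R_16F, lda,
                      B, CUDA_R_16F, ldb,
                      &beta, C, CUDA_R_16F, ldc,
                      CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}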

Hi,

nvJet is a cuBLAS backend and is not specific to Jetson.

Did you use the same test for both benchmarks?
Could you share a sample with us so we can check it further internally?

Thanks.

Thanks for replying.

Yes, the results come from the same script; the only change is the compute type, from CUBLAS_COMPUTE_16F to CUBLAS_COMPUTE_32F (plus passing alpha/beta as float, which CUBLAS_COMPUTE_32F expects).
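
Concretely, only the scalar types and the compute type differ between the two runs; a sketch of the changed lines relative to the script below:

// FP16 accumulation (as in the script below):
//   __half alpha = __float2half(1.0f), beta = __float2half(0.0f);
//   ... CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP ...
// FP32 accumulation variant:
//   float alpha = 1.0f, beta = 0.0f;  // CUBLAS_COMPUTE_32F expects float scalars
//   ... CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP ...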

Here’s the script I used:

// tc_gemm_fp16_acc16.cu
// Tensor Core FP16 GEMM benchmark using cuBLAS with FP16 accumulation.
//
// Build:
//   nvcc -O3 -std=c++17 tc_gemm_fp16_acc16.cu -o tc_gemm_fp16_acc16 -lcublas
//
// Run examples:
//   ./tc_gemm_fp16_acc16 --m=4096 --n=4096 --k=4096 --iters=200 --warmup=20
//
// Notes:
// - FP16 inputs/outputs with FP16 accumulation (CUBLAS_COMPUTE_16F).
// - Requests Tensor Cores via CUBLAS_GEMM_DEFAULT_TENSOR_OP and TENSOR_OP_MATH.
// - TFLOP/s = (2*M*N*K * iters) / elapsed_time / 1e12.
// - Sizes that are multiples of 128/256 often map best to TC tiles.

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <iostream>
#include <iomanip>
#include <cuda_fp16.h>

#define CUDA_CHECK(call) do { \
  cudaError_t err = (call); \
  if (err != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
    std::exit(EXIT_FAILURE); \
  } \
} while (0)

#define CUBLAS_CHECK(call) do { \
  cublasStatus_t st = (call); \
  if (st != CUBLAS_STATUS_SUCCESS) { \
    fprintf(stderr, "cuBLAS error %s:%d: status %d\n", __FILE__, __LINE__, (int)st); \
    std::exit(EXIT_FAILURE); \
  } \
} while (0)

struct Args {
  int m = 8192, n = 8192, k = 8192;
  int iters = 200;
  int warmup = 20;
  bool column_major = false; // default: treat buffers as row-major and use transpose trick
};

Args parse(int argc, char** argv) {
  Args a;
  for (int i=1;i<argc;++i) {
    std::string s(argv[i]);
    auto val = [&](const char* k)->const char* {
      if (s.rfind(k,0)==0) { const char* p = s.c_str()+strlen(k); if (*p=='=') ++p; return p; }
      return nullptr;
    };
    if (s=="--help"||s=="-h") {
      std::cout << "Usage: " << argv[0]
                << " [--m=M] [--n=N] [--k=K] [--iters=N] [--warmup=N] [--colmajor]\n";
      std::exit(0);
    }
    if (s.rfind("--m",0)==0) a.m = std::stoi(val("--m"));
    else if (s.rfind("--n",0)==0) a.n = std::stoi(val("--n"));
    else if (s.rfind("--k",0)==0) a.k = std::stoi(val("--k"));
    else if (s.rfind("--iters",0)==0) a.iters = std::stoi(val("--iters"));
    else if (s.rfind("--warmup",0)==0) a.warmup = std::stoi(val("--warmup"));
    else if (s=="--colmajor") a.column_major = true;
  }
  return a;
}

// Initialize FP16 buffer with pseudo-random values (GPU-side)
__global__ void init_half(__half* data, size_t n, unsigned long long seed) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    unsigned long long x = seed ^ (1469598103934665603ull + i*1099511628211ull);
    x ^= x >> 12; x ^= x << 25; x ^= x >> 27;
    float f = (float)(x & 0xFFFF) / 65535.0f - 0.5f; // [-0.5,0.5]
    data[i] = __float2half(f);
  }
}

int main(int argc, char** argv) {
  Args a = parse(argc, argv);

  int dev=0; CUDA_CHECK(cudaGetDevice(&dev));
  cudaDeviceProp prop{}; CUDA_CHECK(cudaGetDeviceProperties(&prop, dev));
  std::cout << "Device: " << prop.name << "\n";
  std::cout << "GEMM size: M=" << a.m << " N=" << a.n << " K=" << a.k << "\n";
  std::cout << "Iters: " << a.iters << " Warmup: " << a.warmup << "\n";

  const size_t elemsA = (size_t)a.m * a.k;
  const size_t elemsB = (size_t)a.k * a.n;
  const size_t elemsC = (size_t)a.m * a.n;

  __half* dA=nullptr; __half* dB=nullptr; __half* dC=nullptr;
  CUDA_CHECK(cudaMalloc(&dA, elemsA * sizeof(__half)));
  CUDA_CHECK(cudaMalloc(&dB, elemsB * sizeof(__half)));
  CUDA_CHECK(cudaMalloc(&dC, elemsC * sizeof(__half)));

  // Initialize A,B; zero C
  {
    dim3 blk(256);
    dim3 grdA((unsigned)((elemsA + blk.x - 1) / blk.x));
    dim3 grdB((unsigned)((elemsB + blk.x - 1) / blk.x));
    init_half<<<grdA, blk>>>(dA, elemsA, 0xA5A5A5A5ull);
    init_half<<<grdB, blk>>>(dB, elemsB, 0x5A5A5A5Aull);
    CUDA_CHECK(cudaMemset(dC, 0, elemsC * sizeof(__half)));
    CUDA_CHECK(cudaDeviceSynchronize());
  }

  cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle));
  CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

  // Scalars in FP16 (compute=16F requires alpha/beta as FP16)
  __half alpha = __float2half(1.0f);
  __half beta  = __float2half(0.0f);

  // Layout handling (cuBLAS is column-major). The GemmEx calls below compute
  // op(X) * op(Y) -> C, where X is the first operand passed and Y the second.
  cublasOperation_t opX = CUBLAS_OP_N, opY = CUBLAS_OP_N;
  const __half *dX = nullptr, *dY = nullptr;
  int ldx, ldy, ldc;
  int m, n, k;

  if (a.column_major) {
    // Column-major buffers: compute C = A * B directly.
    m = a.m; n = a.n; k = a.k;
    dX = dA; ldx = a.m;   // A is m x k, ld = m
    dY = dB; ldy = a.k;   // B is k x n, ld = k
    ldc = a.m;            // C is m x n, ld = m
  } else {
    // Row-major buffers: compute C^T = B^T * A^T. In cuBLAS's column-major
    // view, the row-major buffers already read as the transposed matrices
    // (dA -> A^T, dB -> B^T, dC -> C^T), so no transpose op is needed;
    // we only swap m <-> n and pass B first.
    m = a.n; n = a.m; k = a.k;
    dX = dB; ldx = a.n;   // B^T is n x k, ld = n
    dY = dA; ldy = a.k;   // A^T is k x m, ld = k
    ldc = a.n;            // C^T is n x m, ld = n
  }

  // Warm-up
  for (int w=0; w<a.warmup; ++w) {
    CUBLAS_CHECK(cublasGemmEx(
      handle,
      opX, opY,
      m, n, k,
      &alpha,
      dX, CUDA_R_16F, ldx,
      dY, CUDA_R_16F, ldy,
      &beta,
      dC, CUDA_R_16F, ldc,
      CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  }
  CUDA_CHECK(cudaDeviceSynchronize());

  // Timed loop
  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));
  CUDA_CHECK(cudaEventRecord(start));
  for (int it=0; it<a.iters; ++it) {
    CUBLAS_CHECK(cublasGemmEx(
      handle,
      opX, opY,
      m, n, k,
      &alpha,
      dX, CUDA_R_16F, ldx,
      dY, CUDA_R_16F, ldy,
      &beta,
      dC, CUDA_R_16F, ldc,
      CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  }
  CUDA_CHECK(cudaEventRecord(stop));
  CUDA_CHECK(cudaEventSynchronize(stop));

  float ms=0.0f;
  CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
  double seconds = ms / 1e3;

  // FLOPs per GEMM = 2*M*N*K (FMA = 2 operations)
  const double flops_per = 2.0 * (double)a.m * (double)a.n * (double)a.k;
  const double total_flops = flops_per * (double)a.iters;
  const double tflops = total_flops / seconds / 1e12;

  std::cout << std::fixed << std::setprecision(2);
  std::cout << "Elapsed: " << ms << " ms for " << a.iters << " GEMMs\n";
  std::cout << "Throughput: " << tflops << " TFLOP/s (FP16 inputs, FP16 accumulate, TC)\n";

  // Cleanup
  CUDA_CHECK(cudaEventDestroy(start)); CUDA_CHECK(cudaEventDestroy(stop));
  CUBLAS_CHECK(cublasDestroy(handle));
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  return 0;
}
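
To confirm which kernel each variant dispatches to, the run can be profiled with Nsight Systems, e.g.:

  nsys profile --stats=true ./tc_gemm_fp16_acc16 --m=4096 --n=4096 --k=4096

The cutlass/nvjet kernel names then show up in the GPU kernel summary (the exact section name varies by nsys version).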