How to achieve 56 TFLOPS performance on RTX 500 Ada?

According to the article Nvidia launches the RTX 500 and 1000 Ada Generation laptop GPUs - allegedly up to 14x better generative AI performance than CPU-only RTX 500 Ada in my new Dell Precision 15 (3590) laptop, Intel® Core™ Ultra 7 Processor 165H, 16 GB RAM, 512 GB SSD should reach 56 TFLOPS.

I used AI to create a test program using C++20, CUDA 12.8, compute capability 8.9, VS 2022, Windows 11, which gives the following results:

[CUDA Device Info]
Name: NVIDIA RTX 500 Ada Generation Laptop GPU
CUDA Cores/SM: approx (arch-dep)
SMs: 16, Clock Rate: 2025 MHz
Estimated Peak FP16 Tensor Core TFLOPS: 56

[Tensor Core Benchmark Result]
Matrix size: 1024 x 1024, Batch size: 16
Time elapsed (avg): 1.41251 ms
Performance: 24.3254 TFLOPS
Efficiency: 43.4382 %

[Tensor Core Benchmark Result]
Matrix size: 2048 x 2048, Batch size: 16
Time elapsed (avg): 11.1051 ms
Performance: 24.7525 TFLOPS
Efficiency: 44.2008 %

[Tensor Core Benchmark Result]
Matrix size: 4096 x 4096, Batch size: 16
Time elapsed (avg): 75.5098 ms
Performance: 29.1224 TFLOPS
Efficiency: 52.0042 %

[Tensor Core Benchmark Result]
Matrix size: 8192 x 8192, Batch size: 1
Time elapsed (avg): 38.3489 ms
Performance: 28.6713 TFLOPS
Efficiency: 51.1987 %

[Tensor Core Benchmark Result]
Matrix size: 16384 x 16384, Batch size: 1
Time elapsed (avg): 305.76 ms
Performance: 28.7679 TFLOPS
Efficiency: 51.3713 %

How to modify my program to get closer to 56 TFLOPS performance and 100% GPU utilization? Can’t I download NVIDIA’s test program somewhere?

#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cmath>

#define REPEAT 10
#define WARMUP 5
#define WORKSPACE_SIZE (1 << 28) // 256MB workspace
#define BATCH_SIZE     16       // Larger batch size for better utilization

void checkCuda(cudaError_t result, const char* msg)
{
    if (result != cudaSuccess)
    {
        std::cerr << "CUDA Error: " << msg << " - "
            << cudaGetErrorString(result) << std::endl;
        exit(1);
    }
}

void checkCublas(cublasStatus_t result, const char* msg)
{
    if (result != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "cuBLAS Error: " << msg << " (Code: "
            << result << ")" << std::endl;
        exit(1);
    }
}

double estimateTheoreticalTFLOPS(const cudaDeviceProp& prop)
{
    return 56.0; // RTX 500 Ada Laptop GPU peak FP16 Tensor Core TFLOPS
}

bool checkMemoryAvailability(size_t bytes, int N, int batch_size = 1)
{
    size_t free, total;
    checkCuda(cudaMemGetInfo(&free, &total), "get memory info");
    size_t required = 3 * bytes * batch_size + WORKSPACE_SIZE; // For A, B, C matrices plus workspace
    if (free < required)
    {
        std::cerr << "Insufficient GPU memory for N=" << N
            << ", batch_size=" << batch_size
            << ". Required: " << required / (1 << 20)
            << " MB, Available: " << free / (1 << 20) << " MB"
            << std::endl;
        return false;
    }
    return true;
}

void runBenchmark(int N, double theoretical_tflops, cudaStream_t computeStream,
    cudaStream_t transferStream, cublasHandle_t cublasHandle)
{
    size_t bytes = N * N * sizeof(__half);
    int batch_size = (N <= 4096) ? BATCH_SIZE : 1;  // Use larger batch for smaller matrices
    if (!checkMemoryAvailability(bytes, N, batch_size))
    {
        std::cerr << "Skipping benchmark for N=" << N
            << " due to insufficient memory" << std::endl;
        return;
    }

    // Allocate pinned host memory.
    __half* h_A, * h_B, * h_C;
    checkCuda(cudaHostAlloc(&h_A, bytes * batch_size, cudaHostAllocDefault), "alloc pinned A");
    checkCuda(cudaHostAlloc(&h_B, bytes * batch_size, cudaHostAllocDefault), "alloc pinned B");
    checkCuda(cudaHostAlloc(&h_C, bytes * batch_size, cudaHostAllocDefault), "alloc pinned C");

    // Initialize host data.
    for (int b = 0; b < batch_size; ++b)
    {
        for (int i = 0; i < N * N; ++i)
        {
            h_A[b * N * N + i] = __float2half(1.0f);
            h_B[b * N * N + i] = __float2half(1.0f);
        }
    }

    // Allocate device memory.
    __half* d_A, * d_B, * d_C;
    checkCuda(cudaMalloc(&d_A, bytes * batch_size), "alloc A");
    checkCuda(cudaMalloc(&d_B, bytes * batch_size), "alloc B");
    checkCuda(cudaMalloc(&d_C, bytes * batch_size), "alloc C");

    // Asynchronous memory transfers.
    cudaEvent_t transferDone;
    checkCuda(cudaEventCreate(&transferDone), "create transfer done event");
    checkCuda(cudaMemcpyAsync(d_A, h_A, bytes * batch_size, cudaMemcpyHostToDevice, transferStream), "copy A");
    checkCuda(cudaMemcpyAsync(d_B, h_B, bytes * batch_size, cudaMemcpyHostToDevice, transferStream), "copy B");
    checkCuda(cudaEventRecord(transferDone, transferStream), "record transfer done");

    // Create cuBLASLt handle and set computestream.
    cublasLtHandle_t ltHandle;
    checkCublas(cublasLtCreate(&ltHandle), "create Lt handle");
    checkCublas(cublasSetStream(cublasHandle, computeStream), "set stream");

    // Create matrix multiplication descriptor.
    cublasLtMatmulDesc_t matmulDesc;
    checkCublas(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_16F, CUDA_R_16F), "create matmul desc");
    cublasOperation_t opN = CUBLAS_OP_N;
    checkCublas(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opN, sizeof(opN)), "set trans B");
    checkCublas(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opN, sizeof(opN)), "set trans A");
    cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
    checkCublas(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)), "set epilogue");

    // Create matrix layout descriptors.
    cublasLtMatrixLayout_t layoutA, layoutB, layoutC;
    checkCublas(cublasLtMatrixLayoutCreate(&layoutA, CUDA_R_16F, N, N, N), "layout A");
    checkCublas(cublasLtMatrixLayoutCreate(&layoutB, CUDA_R_16F, N, N, N), "layout B");
    checkCublas(cublasLtMatrixLayoutCreate(&layoutC, CUDA_R_16F, N, N, N), "layout C");

    __half alpha = __float2half(1.0f);
    __half beta = __float2half(0.0f);

    // Create and set matmul preference.
    cublasLtMatmulPreference_t preferenceDesc;
    checkCublas(cublasLtMatmulPreferenceCreate(&preferenceDesc), "create preference");
    size_t workspaceSize = WORKSPACE_SIZE;
    checkCublas(cublasLtMatmulPreferenceSetAttribute(preferenceDesc,
        CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)),
        "set workspace size");

    // Get heuristic results for algorithm selection.
    cublasLtMatmulHeuristicResult_t heuristicResults[16];
    int returnedResults = 0;
    checkCublas(cublasLtMatmulAlgoGetHeuristic(ltHandle, matmulDesc, layoutA, layoutB, layoutC, layoutC,
        preferenceDesc, 16, heuristicResults, &returnedResults),
        "get heuristic");
    if (returnedResults == 0)
    {
        std::cerr << "No suitable algorithm found!\n";
        exit(1);
    }

    // Allocate workspace on device.
    void* d_workspace;
    checkCuda(cudaMalloc(&d_workspace, WORKSPACE_SIZE), "alloc workspace");

    // Select the best algorithm via warmup iterations.
    float minTime = std::numeric_limits<float>::max();
    int bestAlgoIdx = 0;
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "create start event");
    checkCuda(cudaEventCreate(&stop), "create stop event");

    for (int algoIdx = 0; algoIdx < returnedResults; ++algoIdx)
    {
        float totalTime = 0.0f;
        // Warmup iterations.
        for (int i = 0; i < WARMUP; ++i)
        {
            checkCuda(cudaStreamWaitEvent(computeStream, transferDone, 0), "wait for transfer");
            for (int b = 0; b < batch_size; ++b)
            {
                checkCublas(cublasLtMatmul(ltHandle, matmulDesc, &alpha,
                    d_A + b * N * N, layoutA,
                    d_B + b * N * N, layoutB,
                    &beta, d_C + b * N * N, layoutC,
                    d_C + b * N * N, layoutC,
                    &heuristicResults[algoIdx].algo,
                    d_workspace, WORKSPACE_SIZE,
                    computeStream),
                    "warmup LtMatmul");
            }
        }
        // Timed iterations.
        for (int i = 0; i < REPEAT; ++i)
        {
            checkCuda(cudaStreamWaitEvent(computeStream, transferDone, 0), "wait for transfer");
            checkCuda(cudaEventRecord(start, computeStream), "record start");
            for (int b = 0; b < batch_size; ++b)
            {
                checkCublas(cublasLtMatmul(ltHandle, matmulDesc, &alpha,
                    d_A + b * N * N, layoutA,
                    d_B + b * N * N, layoutB,
                    &beta, d_C + b * N * N, layoutC,
                    d_C + b * N * N, layoutC,
                    &heuristicResults[algoIdx].algo,
                    d_workspace, WORKSPACE_SIZE,
                    computeStream),
                    "benchmark LtMatmul");
            }
            checkCuda(cudaEventRecord(stop, computeStream), "record stop");
            checkCuda(cudaEventSynchronize(stop), "synchronize stop");
            float ms;
            checkCuda(cudaEventElapsedTime(&ms, start, stop), "elapsed time");
            totalTime += ms;
        }
        float avgTime = totalTime / REPEAT;
        if (avgTime < minTime)
        {
            minTime = avgTime;
            bestAlgoIdx = algoIdx;
        }
    }

    // Capture final multiplication loop with CUDA Graphs to reduce launch overhead.
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;
    checkCuda(cudaStreamBeginCapture(computeStream, cudaStreamCaptureModeGlobal), "begin capture");
    for (int b = 0; b < batch_size; ++b)
    {
        checkCublas(cublasLtMatmul(ltHandle, matmulDesc, &alpha,
            d_A + b * N * N, layoutA,
            d_B + b * N * N, layoutB,
            &beta, d_C + b * N * N, layoutC,
            d_C + b * N * N, layoutC,
            &heuristicResults[bestAlgoIdx].algo,
            d_workspace, WORKSPACE_SIZE,
            computeStream),
            "graph LtMatmul");
    }
    checkCuda(cudaStreamEndCapture(computeStream, &graph), "end capture");
    checkCuda(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0), "instantiate graph");

    // Run captured graph and measure time.
    float totalGraphTime = 0.0f;
    for (int i = 0; i < REPEAT; ++i)
    {
        checkCuda(cudaEventRecord(start, computeStream), "record graph start");
        checkCuda(cudaGraphLaunch(graphExec, computeStream), "launch graph");
        checkCuda(cudaEventRecord(stop, computeStream), "record graph stop");
        checkCuda(cudaEventSynchronize(stop), "synchronize graph stop");
        float ms;
        checkCuda(cudaEventElapsedTime(&ms, start, stop), "elapsed graph time");
        totalGraphTime += ms;
    }
    float avgTimeGraph = totalGraphTime / REPEAT;
    double ops = 2.0 * std::pow(N, 3) * batch_size;  // Count FLOPs
    double tflops = ops / (avgTimeGraph / 1000.0) / 1e12;
    double percent = (tflops / theoretical_tflops) * 100.0;

    std::cout << "\n[Tensor Core Benchmark Result]" << std::endl;
    std::cout << "  Matrix size: " << N << " x " << N << ", Batch size: " << batch_size << std::endl;
    std::cout << "  Time elapsed (avg): " << avgTimeGraph << " ms" << std::endl;
    std::cout << "  Performance: " << tflops << " TFLOPS" << std::endl;
    std::cout << "  Efficiency: " << percent << " %" << std::endl;

    // Cleanup.
    checkCuda(cudaFreeHost(h_A), "free pinned A");
    checkCuda(cudaFreeHost(h_B), "free pinned B");
    checkCuda(cudaFreeHost(h_C), "free pinned C");
    checkCuda(cudaFree(d_A), "free A");
    checkCuda(cudaFree(d_B), "free B");
    checkCuda(cudaFree(d_C), "free C");
    checkCuda(cudaFree(d_workspace), "free workspace");
    checkCuda(cudaEventDestroy(start), "destroy start event");
    checkCuda(cudaEventDestroy(stop), "destroy stop event");
    checkCuda(cudaEventDestroy(transferDone), "destroy transfer done event");
    checkCublas(cublasLtMatmulDescDestroy(matmulDesc), "destroy matmul desc");
    checkCublas(cublasLtMatrixLayoutDestroy(layoutA), "destroy layout A");
    checkCublas(cublasLtMatrixLayoutDestroy(layoutB), "destroy layout B");
    checkCublas(cublasLtMatrixLayoutDestroy(layoutC), "destroy layout C");
    checkCublas(cublasLtMatmulPreferenceDestroy(preferenceDesc), "destroy preference");
    checkCublas(cublasLtDestroy(ltHandle), "destroy Lt handle");
    checkCuda(cudaGraphDestroy(graph), "destroy graph");
    checkCuda(cudaGraphExecDestroy(graphExec), "destroy graph exec");
}

int main()
{
    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, 0), "get device properties");
    std::cout << "[CUDA Device Info]" << std::endl;
    std::cout << "  Name: " << prop.name << std::endl;
    std::cout << "  CUDA Cores/SM: approx (arch-dep)" << std::endl;
    std::cout << "  SMs: " << prop.multiProcessorCount
        << ", Clock Rate: " << (prop.clockRate / 1000) << " MHz" << std::endl;

    double theoretical_tflops = estimateTheoreticalTFLOPS(prop);
    std::cout << "  Estimated Peak FP16 Tensor Core TFLOPS: " << theoretical_tflops << std::endl;

    cudaStream_t computeStream, transferStream;
    checkCuda(cudaStreamCreate(&computeStream), "create compute stream");
    checkCuda(cudaStreamCreate(&transferStream), "create transfer stream");

    cublasHandle_t cublasHandle;
    checkCublas(cublasCreate(&cublasHandle), "create cublas handle");

    int sizes[] = { 1024, 2048, 4096, 8192, 16384 }; // Test various matrix sizes.
    for (int N : sizes)
    {
        runBenchmark(N, theoretical_tflops, computeStream, transferStream, cublasHandle);
    }

    checkCublas(cublasDestroy(cublasHandle), "destroy cublas handle");
    checkCuda(cudaStreamDestroy(computeStream), "destroy compute stream");
    checkCuda(cudaStreamDestroy(transferStream), "destroy transfer stream");

    return 0;
}

The table in the article says 147.4 TFLOPs. You read the RTX A500 Ampere generation value instead of the RTX 500 Ada Generation one.

Those TFLOPs are for sparse matrices, so you have to half them for dense matrices.

On a notebook there can be issues with power settings of the operating system, e.g. efficiency vs. performance.

It’s usually not possible to hit the peak theoretical number on any GPU. As already mentioned a significant limiting factor on a laptop and some other GPUs will be power. The laptop GPU does not necessarily have enough power to deliver sustained performance at the highest level. Therefore your 28TF/s number might well be a good estimate of the sustained performance.

If you want to see something higher than that, you may need to shorten the duration of matrix multiply (i.e. adjust the size) and perhaps don’t do batched operations. You will need to launch the work once to overcome various first-time challenges, then wait a while to let power stabilize, then launch the work again. Tune the matrix multiply dimension higher or lower until you find the peak. The objective is to hit the GPU with as much work before the power management kicks in, and then preferably finish up your work at that point.

I forgot to add a graphical monitoring of the program’s execution. It looks like it’s not running at full speed and something is holding it back.

Can you show the row with the Cuda kernel calls (perhaps I missed them)?
You could also benchmark with Nsight Compute.

I can’t find Cuda kernel calls. Where are they?
I see in Nsight Systems:

  • CPU
  • GPU (0000:01>00.0 - NVIDIA
  • GPU (Intel(R) Graphics)
  • Processes

Check which data you are recording (there are checkboxes before starting the recording).

While your test is running, periodically run:

nvidia-smi -q

and take note of the “Clocks Throttle Reasons” and “Power Readings” for signs of limitation.

Clocks Event Reasons
    Idle                              : Not Active
    Applications Clocks Setting       : Not Active
    SW Power Cap                      : Active
    HW Slowdown                       : Not Active
        HW Thermal Slowdown           : Not Active
        HW Power Brake Slowdown       : Not Active
    Sync Boost                        : Not Active
    SW Thermal Slowdown               : Not Active
    Display Clock Setting             : Not Active
Sparse Operation Mode                 : N/A

Utilization
    GPU                               : 100 %
    Memory                            : 100 %
    Encoder                           : 0 %
    Decoder                           : 0 %
    JPEG                              : 0 %
    OFA                               : 0 %

Temperature
    GPU Current Temp                  : 66 C
    GPU T.Limit Temp                  : 21 C
    GPU Shutdown T.Limit Temp         : -12 C
    GPU Slowdown T.Limit Temp         : -2 C
    GPU Max Operating T.Limit Temp    : 0 C
    GPU Target Temperature            : 87 C
    Memory Current Temp               : N/A
    Memory Max Operating T.Limit Temp : N/A
GPU Power Readings
    Power Draw                        : 31.09 W
    Current Power Limit               : 31.14 W
    Requested Power Limit             : 31.14 W
    Default Power Limit               : 30.00 W
    Min Power Limit                   : 5.00 W
    Max Power Limit                   : 35.00 W

Which tends to indicate maximum performance, capped by power consumption. The maximum power limit is 35W and the requested, (presumably by the power management on the laptop), is within 4W.

For non-mobile GPUs the Current Power Limit is set by the user, and can be increased via nvidia-smi up to the Max Power Limit. I have no experience with GPUs in laptops, but a quick experiment should indicate whether raising the limit to the maximum is in fact possible (note: administrative privilege is needed to change environmental limits). The reason this is worth trying is because their still seems to be ample headroom with respect to the temperature limit.

Have you looked at the Windows Energy settings and set the GPU to some high performance setting?