According to the article "Nvidia launches the RTX 500 and 1000 Ada Generation laptop GPUs - allegedly up to 14x better generative AI performance than CPU-only", the RTX 500 Ada in my new Dell Precision 15 (3590) laptop (Intel® Core™ Ultra 7 165H, 16 GB RAM, 512 GB SSD) should reach 56 TFLOPS.
I used AI to create a test program (C++20, CUDA 12.8, compute capability 8.9, VS 2022, Windows 11), which gives the following results:
[CUDA Device Info]
Name: NVIDIA RTX 500 Ada Generation Laptop GPU
CUDA Cores/SM: approx (arch-dep)
SMs: 16, Clock Rate: 2025 MHz
Estimated Peak FP16 Tensor Core TFLOPS: 56
[Tensor Core Benchmark Result]
Matrix size: 1024 x 1024, Batch size: 16
Time elapsed (avg): 1.41251 ms
Performance: 24.3254 TFLOPS
Efficiency: 43.4382 %
[Tensor Core Benchmark Result]
Matrix size: 2048 x 2048, Batch size: 16
Time elapsed (avg): 11.1051 ms
Performance: 24.7525 TFLOPS
Efficiency: 44.2008 %
[Tensor Core Benchmark Result]
Matrix size: 4096 x 4096, Batch size: 16
Time elapsed (avg): 75.5098 ms
Performance: 29.1224 TFLOPS
Efficiency: 52.0042 %
[Tensor Core Benchmark Result]
Matrix size: 8192 x 8192, Batch size: 1
Time elapsed (avg): 38.3489 ms
Performance: 28.6713 TFLOPS
Efficiency: 51.1987 %
[Tensor Core Benchmark Result]
Matrix size: 16384 x 16384, Batch size: 1
Time elapsed (avg): 305.76 ms
Performance: 28.7679 TFLOPS
Efficiency: 51.3713 %
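(For reference, the performance and efficiency figures are derived from the standard dense-GEMM operation count: FLOPs = 2 · N³ · batch_size, TFLOPS = FLOPs / time, Efficiency = TFLOPS / 56 × 100 %. For example, at N = 4096 with batch size 16, 2 · 4096³ · 16 ≈ 2.2 · 10¹² FLOPs in 75.51 ms gives ≈ 29.1 TFLOPS, i.e. about 52 % of the quoted 56 TFLOPS peak.)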
How can I modify my program to get closer to the 56 TFLOPS peak and to 100% GPU utilization? Is there an official test program from NVIDIA that I can download somewhere?
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cmath>
#include <cstdlib>
#include <limits>
#define REPEAT 10
#define WARMUP 5
#define WORKSPACE_SIZE (1 << 28) // 256MB workspace
#define BATCH_SIZE 16 // Larger batch size for better utilization
void checkCuda(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << "CUDA Error: " << msg << " - "
<< cudaGetErrorString(result) << std::endl;
exit(1);
}
}
void checkCublas(cublasStatus_t result, const char* msg)
{
if (result != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "cuBLAS Error: " << msg << " (Code: "
<< result << ")" << std::endl;
exit(1);
}
}
double estimateTheoreticalTFLOPS(const cudaDeviceProp& prop)
{
return 56.0; // RTX 500 Ada Laptop GPU peak FP16 Tensor Core TFLOPS
}
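// (A possible alternative, shown only as a sketch and not used by the benchmark:
// derive the peak from the device properties instead of hard-coding 56.
// The FP16 Tensor Core FLOPs per SM per clock is an assumed parameter here; it
// depends on the architecture and on whether FP16 or FP32 accumulation is used.)
double estimateTheoreticalTFLOPSFromProps(const cudaDeviceProp& prop, double fp16FlopsPerSMPerClock)
{
double clockGHz = prop.clockRate / 1.0e6; // clockRate is reported in kHz
// SMs * (FLOPs per SM per clock) * clock in GHz = GFLOPS; divide by 1000 for TFLOPS.
return prop.multiProcessorCount * fp16FlopsPerSMPerClock * clockGHz / 1000.0;
}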
bool checkMemoryAvailability(size_t bytes, int N, int batch_size = 1)
{
size_t free, total;
checkCuda(cudaMemGetInfo(&free, &total), "get memory info");
size_t required = 3 * bytes * batch_size + WORKSPACE_SIZE; // For A, B, C matrices plus workspace
if (free < required)
{
std::cerr << "Insufficient GPU memory for N=" << N
<< ", batch_size=" << batch_size
<< ". Required: " << required / (1 << 20)
<< " MB, Available: " << free / (1 << 20) << " MB"
<< std::endl;
return false;
}
return true;
}
void runBenchmark(int N, double theoretical_tflops, cudaStream_t computeStream,
cudaStream_t transferStream, cublasHandle_t cublasHandle)
{
size_t bytes = N * N * sizeof(__half);
int batch_size = (N <= 4096) ? BATCH_SIZE : 1; // Use larger batch for smaller matrices
if (!checkMemoryAvailability(bytes, N, batch_size))
{
std::cerr << "Skipping benchmark for N=" << N
<< " due to insufficient memory" << std::endl;
return;
}
// Allocate pinned host memory.
__half* h_A, * h_B, * h_C;
checkCuda(cudaHostAlloc(&h_A, bytes * batch_size, cudaHostAllocDefault), "alloc pinned A");
checkCuda(cudaHostAlloc(&h_B, bytes * batch_size, cudaHostAllocDefault), "alloc pinned B");
checkCuda(cudaHostAlloc(&h_C, bytes * batch_size, cudaHostAllocDefault), "alloc pinned C");
// Initialize host data.
for (int b = 0; b < batch_size; ++b)
{
for (int i = 0; i < N * N; ++i)
{
h_A[b * N * N + i] = __float2half(1.0f);
h_B[b * N * N + i] = __float2half(1.0f);
}
}
// Allocate device memory.
__half* d_A, * d_B, * d_C;
checkCuda(cudaMalloc(&d_A, bytes * batch_size), "alloc A");
checkCuda(cudaMalloc(&d_B, bytes * batch_size), "alloc B");
checkCuda(cudaMalloc(&d_C, bytes * batch_size), "alloc C");
// Asynchronous memory transfers.
cudaEvent_t transferDone;
checkCuda(cudaEventCreate(&transferDone), "create transfer done event");
checkCuda(cudaMemcpyAsync(d_A, h_A, bytes * batch_size, cudaMemcpyHostToDevice, transferStream), "copy A");
checkCuda(cudaMemcpyAsync(d_B, h_B, bytes * batch_size, cudaMemcpyHostToDevice, transferStream), "copy B");
checkCuda(cudaEventRecord(transferDone, transferStream), "record transfer done");
// Create cuBLASLt handle and set the compute stream.
cublasLtHandle_t ltHandle;
checkCublas(cublasLtCreate(&ltHandle), "create Lt handle");
checkCublas(cublasSetStream(cublasHandle, computeStream), "set stream");
// Create matrix multiplication descriptor.
cublasLtMatmulDesc_t matmulDesc;
checkCublas(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_16F, CUDA_R_16F), "create matmul desc");
cublasOperation_t opN = CUBLAS_OP_N;
checkCublas(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opN, sizeof(opN)), "set trans B");
checkCublas(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opN, sizeof(opN)), "set trans A");
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
checkCublas(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)), "set epilogue");
// Create matrix layout descriptors.
cublasLtMatrixLayout_t layoutA, layoutB, layoutC;
checkCublas(cublasLtMatrixLayoutCreate(&layoutA, CUDA_R_16F, N, N, N), "layout A");
checkCublas(cublasLtMatrixLayoutCreate(&layoutB, CUDA_R_16F, N, N, N), "layout B");
checkCublas(cublasLtMatrixLayoutCreate(&layoutC, CUDA_R_16F, N, N, N), "layout C");
__half alpha = __float2half(1.0f);
__half beta = __float2half(0.0f);
// Create and set matmul preference.
cublasLtMatmulPreference_t preferenceDesc;
checkCublas(cublasLtMatmulPreferenceCreate(&preferenceDesc), "create preference");
size_t workspaceSize = WORKSPACE_SIZE;
checkCublas(cublasLtMatmulPreferenceSetAttribute(preferenceDesc,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)),
"set workspace size");
// Get heuristic results for algorithm selection.
cublasLtMatmulHeuristicResult_t heuristicResults[16];
int returnedResults = 0;
checkCublas(cublasLtMatmulAlgoGetHeuristic(ltHandle, matmulDesc, layoutA, layoutB, layoutC, layoutC,
preferenceDesc, 16, heuristicResults, &returnedResults),
"get heuristic");
if (returnedResults == 0)
{
std::cerr << "No suitable algorithm found!\n";
exit(1);
}
// Allocate workspace on device.
void* d_workspace;
checkCuda(cudaMalloc(&d_workspace, WORKSPACE_SIZE), "alloc workspace");
// Select the fastest algorithm by timing each heuristic candidate (after warmup iterations).
float minTime = std::numeric_limits<float>::max();
int bestAlgoIdx = 0;
cudaEvent_t start, stop;
checkCuda(cudaEventCreate(&start), "create start event");
checkCuda(cudaEventCreate(&stop), "create stop event");
for (int algoIdx = 0; algoIdx < returnedResults; ++algoIdx)
{
float totalTime = 0.0f;
// Warmup iterations.
for (int i = 0; i < WARMUP; ++i)
{
checkCuda(cudaStreamWaitEvent(computeStream, transferDone, 0), "wait for transfer");
for (int b = 0; b < batch_size; ++b)
{
checkCublas(cublasLtMatmul(ltHandle, matmulDesc, &alpha,
d_A + b * N * N, layoutA,
d_B + b * N * N, layoutB,
&beta, d_C + b * N * N, layoutC,
d_C + b * N * N, layoutC,
&heuristicResults[algoIdx].algo,
d_workspace, WORKSPACE_SIZE,
computeStream),
"warmup LtMatmul");
}
}
// Timed iterations.
for (int i = 0; i < REPEAT; ++i)
{
checkCuda(cudaStreamWaitEvent(computeStream, transferDone, 0), "wait for transfer");
checkCuda(cudaEventRecord(start, computeStream), "record start");
for (int b = 0; b < batch_size; ++b)
{
checkCublas(cublasLtMatmul(ltHandle, matmulDesc, &alpha,
d_A + b * N * N, layoutA,
d_B + b * N * N, layoutB,
&beta, d_C + b * N * N, layoutC,
d_C + b * N * N, layoutC,
&heuristicResults[algoIdx].algo,
d_workspace, WORKSPACE_SIZE,
computeStream),
"benchmark LtMatmul");
}
checkCuda(cudaEventRecord(stop, computeStream), "record stop");
checkCuda(cudaEventSynchronize(stop), "synchronize stop");
float ms;
checkCuda(cudaEventElapsedTime(&ms, start, stop), "elapsed time");
totalTime += ms;
}
float avgTime = totalTime / REPEAT;
if (avgTime < minTime)
{
minTime = avgTime;
bestAlgoIdx = algoIdx;
}
}
// Capture final multiplication loop with CUDA Graphs to reduce launch overhead.
cudaGraph_t graph;
cudaGraphExec_t graphExec;
checkCuda(cudaStreamBeginCapture(computeStream, cudaStreamCaptureModeGlobal), "begin capture");
for (int b = 0; b < batch_size; ++b)
{
checkCublas(cublasLtMatmul(ltHandle, matmulDesc, &alpha,
d_A + b * N * N, layoutA,
d_B + b * N * N, layoutB,
&beta, d_C + b * N * N, layoutC,
d_C + b * N * N, layoutC,
&heuristicResults[bestAlgoIdx].algo,
d_workspace, WORKSPACE_SIZE,
computeStream),
"graph LtMatmul");
}
checkCuda(cudaStreamEndCapture(computeStream, &graph), "end capture");
checkCuda(cudaGraphInstantiate(&graphExec, graph, 0), "instantiate graph"); // CUDA 12 signature (flags only)
// Run captured graph and measure time.
float totalGraphTime = 0.0f;
for (int i = 0; i < REPEAT; ++i)
{
checkCuda(cudaEventRecord(start, computeStream), "record graph start");
checkCuda(cudaGraphLaunch(graphExec, computeStream), "launch graph");
checkCuda(cudaEventRecord(stop, computeStream), "record graph stop");
checkCuda(cudaEventSynchronize(stop), "synchronize graph stop");
float ms;
checkCuda(cudaEventElapsedTime(&ms, start, stop), "elapsed graph time");
totalGraphTime += ms;
}
float avgTimeGraph = totalGraphTime / REPEAT;
double ops = 2.0 * std::pow(N, 3) * batch_size; // Count FLOPs
double tflops = ops / (avgTimeGraph / 1000.0) / 1e12;
double percent = (tflops / theoretical_tflops) * 100.0;
std::cout << "\n[Tensor Core Benchmark Result]" << std::endl;
std::cout << " Matrix size: " << N << " x " << N << ", Batch size: " << batch_size << std::endl;
std::cout << " Time elapsed (avg): " << avgTimeGraph << " ms" << std::endl;
std::cout << " Performance: " << tflops << " TFLOPS" << std::endl;
std::cout << " Efficiency: " << percent << " %" << std::endl;
// Cleanup.
checkCuda(cudaFreeHost(h_A), "free pinned A");
checkCuda(cudaFreeHost(h_B), "free pinned B");
checkCuda(cudaFreeHost(h_C), "free pinned C");
checkCuda(cudaFree(d_A), "free A");
checkCuda(cudaFree(d_B), "free B");
checkCuda(cudaFree(d_C), "free C");
checkCuda(cudaFree(d_workspace), "free workspace");
checkCuda(cudaEventDestroy(start), "destroy start event");
checkCuda(cudaEventDestroy(stop), "destroy stop event");
checkCuda(cudaEventDestroy(transferDone), "destroy transfer done event");
checkCublas(cublasLtMatmulDescDestroy(matmulDesc), "destroy matmul desc");
checkCublas(cublasLtMatrixLayoutDestroy(layoutA), "destroy layout A");
checkCublas(cublasLtMatrixLayoutDestroy(layoutB), "destroy layout B");
checkCublas(cublasLtMatrixLayoutDestroy(layoutC), "destroy layout C");
checkCublas(cublasLtMatmulPreferenceDestroy(preferenceDesc), "destroy preference");
checkCublas(cublasLtDestroy(ltHandle), "destroy Lt handle");
checkCuda(cudaGraphDestroy(graph), "destroy graph");
checkCuda(cudaGraphExecDestroy(graphExec), "destroy graph exec");
}
int main()
{
cudaDeviceProp prop;
checkCuda(cudaGetDeviceProperties(&prop, 0), "get device properties");
std::cout << "[CUDA Device Info]" << std::endl;
std::cout << " Name: " << prop.name << std::endl;
std::cout << " CUDA Cores/SM: approx (arch-dep)" << std::endl;
std::cout << " SMs: " << prop.multiProcessorCount
<< ", Clock Rate: " << (prop.clockRate / 1000) << " MHz" << std::endl;
double theoretical_tflops = estimateTheoreticalTFLOPS(prop);
std::cout << " Estimated Peak FP16 Tensor Core TFLOPS: " << theoretical_tflops << std::endl;
cudaStream_t computeStream, transferStream;
checkCuda(cudaStreamCreate(&computeStream), "create compute stream");
checkCuda(cudaStreamCreate(&transferStream), "create transfer stream");
cublasHandle_t cublasHandle;
checkCublas(cublasCreate(&cublasHandle), "create cublas handle");
int sizes[] = { 1024, 2048, 4096, 8192, 16384 }; // Test various matrix sizes.
for (int N : sizes)
{
runBenchmark(N, theoretical_tflops, computeStream, transferStream, cublasHandle);
}
checkCublas(cublasDestroy(cublasHandle), "destroy cublas handle");
checkCuda(cudaStreamDestroy(computeStream), "destroy compute stream");
checkCuda(cudaStreamDestroy(transferStream), "destroy transfer stream");
return 0;
}