Confusion about Nsight Compute profiler results

I am trying to run a GPU memory bandwidth test using the simplest code possible:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "cudaError: %s\n\tfile<\"%s\">\n\t\tline:%d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

__global__ void read(int* a, int* b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    b[tid] = a[tid];
}

int main()
{
    const long long size = 1ll << 28;
    const long long blockSize = 256;

    int* d_a; cudaErrorCheck(cudaMalloc(&d_a, size * sizeof(int)));
    int* d_b; cudaErrorCheck(cudaMalloc(&d_b, size * sizeof(int)));

    read<<<size / blockSize, blockSize>>>(d_a, d_b);
    cudaErrorCheck(cudaDeviceSynchronize());  // wait for the kernel to finish before exiting

    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}

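For what it's worth, the bandwidth can also be timed on the host side with CUDA events instead of relying on the profiler. Below is a sketch of that idea (my own addition, not from the original post): a warm-up launch followed by a timed launch, with the kernel moving 2 * size * sizeof(int) bytes (one read and one write per thread). Host-side event timing includes launch overhead, so expect figures slightly below what Nsight Compute reports.

// Timing the same copy kernel with CUDA events (sketch).
#include "cuda_runtime.h"
#include <cstdio>

__global__ void read(int* a, int* b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    b[tid] = a[tid];
}

int main()
{
    const long long size = 1ll << 28;
    const long long blockSize = 256;

    int *d_a, *d_b;
    cudaMalloc(&d_a, size * sizeof(int));
    cudaMalloc(&d_b, size * sizeof(int));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    read<<<size / blockSize, blockSize>>>(d_a, d_b);  // warm-up launch

    cudaEventRecord(start);
    read<<<size / blockSize, blockSize>>>(d_a, d_b);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    // Each thread reads one int and writes one int: 2 * size * sizeof(int) bytes total.
    const double bytes = 2.0 * size * sizeof(int);
    printf("%.1f GB/s (%.1f GiB/s)\n",
           bytes / ms / 1e6,                            // bytes / (ms/1000) / 1e9
           bytes / ms * 1e3 / double(1ull << 30));      // bytes / (ms/1000) / 2^30

    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}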
I am benchmarking an RTX 2080 Max-Q graphics card. The weird thing is that no matter how I tweak this code, I always hit a maximum memory bandwidth of around 356-357 GB/s.
Now … I did some calculations for the theoretical peak bandwidth that this card should have:

Memory clock: 1.5 GHz
Memory bus width: 256 bit
Number of Streaming Multiprocessors (SMs): 46
Number of Memory Controllers (MCs): 8
I should get: 1.5e9 Transactions/s × 256 bits/Transaction/MC × 8 MC × 1/8 Bytes/bit = 384e9 Bytes/s

Now that figure is 384 GB/s, or 384e9 / 2^30 = 357.62 GiB/s!
That's the ~92% of peak memory bandwidth I've been unable to surpass.

I suspect this is a bug in the Nsight Compute report: the absolute bandwidth it reports is 357 "GB/s" (which is really GiB/s), while the graph shows 92% of peak because, this time, it uses the 384 GB/s figure. What am I missing? Maybe you could run the benchmark and see if you get the same results?

This question was answered on Stack Overflow: