I wanted to test my kernel's bandwidth, but I found that the bandwidth ncu measured was very different from the one I computed myself.
My test code is as follows; the GPU is an A800 and the CUDA version is 12.
#include <cuda_runtime.h>
#include <stdio.h>
#define BDIMX 256
#define BDIMY 4
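// NOTE: CHECK and initialData are not shown in the original listing; the two
// definitions below are only a minimal sketch of those helpers (an
// error-checking macro and a random initializer) so the example compiles
// on its own.
#include <stdlib.h>
#define CHECK(call)                                                     \
{                                                                       \
    const cudaError_t error = (call);                                   \
    if (error != cudaSuccess)                                           \
    {                                                                   \
        printf("CUDA error %s:%d: %s\n", __FILE__, __LINE__,            \
               cudaGetErrorString(error));                              \
        exit(1);                                                        \
    }                                                                   \
}
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++) ip[i] = (float)(rand() & 0xFF);  // values in [0, 255]
}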
__global__ void copyRow(float *in, float *out, int nx, int ny)
{
    // each thread copies 4 elements of a row, strided by blockDim.x
    int ix = threadIdx.x + blockDim.x * blockIdx.x * 4;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    int idx = ix + iy * nx;
    if (ix < nx && iy < ny)
    {
        out[idx]                  = in[idx];
        out[idx + 1 * blockDim.x] = in[idx + 1 * blockDim.x];
        out[idx + 2 * blockDim.x] = in[idx + 2 * blockDim.x];
        out[idx + 3 * blockDim.x] = in[idx + 3 * blockDim.x];
    }
}
int main(int argc, char **argv)
{
    printf("starting...\n");
    int nx = 1 << 12;
    int ny = 1 << 12;
    int dimx = BDIMX;
    int dimy = BDIMY;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    int transform_kernel = 0;

    // host allocations
    float *A_host = (float *)malloc(nBytes);
    float *B_host_cpu = (float *)malloc(nBytes);
    float *B_host = (float *)malloc(nBytes);
    initialData(A_host, nxy);

    // device allocations
    float *A_dev = NULL;
    float *B_dev = NULL;
    CHECK(cudaMalloc((void **)&A_dev, nBytes));
    CHECK(cudaMalloc((void **)&B_dev, nBytes));
    CHECK(cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemset(B_dev, 0, nBytes));

    // 2D block and 2D grid; grid.x is divided by 4 because each thread copies 4 elements
    dim3 block(dimx, dimy);
    dim3 grid_1((nx - 1) / (block.x * 4) + 1, (ny - 1) / block.y + 1);
    copyRow<<<grid_1, block>>>(A_dev, B_dev, nx, ny);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(B_host, B_dev, nBytes, cudaMemcpyDeviceToHost));

    cudaFree(A_dev);
    cudaFree(B_dev);
    free(A_host);
    free(B_host);
    free(B_host_cpu);
    cudaDeviceReset();
    return 0;
}
The kernel time measured by nsys is 91009 ns:
[6/8] Executing 'cuda_gpu_kern_sum' stats report
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- -------- -------- -------- -------- ----------- -----------------------------------
100.0 91009 1 91009.0 91009.0 91009 91009 0.0 copyRow(float *, float *, int, int)
[7/8] Executing 'cuda_gpu_mem_time_sum' stats report
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
-------- --------------- ----- ---------- ---------- -------- -------- ----------- ------------------
81.2 34004415 1 34004415.0 34004415.0 34004415 34004415 0.0 [CUDA memcpy DtoH]
18.7 7816603 1 7816603.0 7816603.0 7816603 7816603 0.0 [CUDA memcpy HtoD]
0.1 38272 1 38272.0 38272.0 38272 38272 0.0 [CUDA memset]
[8/8] Executing 'cuda_gpu_mem_size_sum' stats report
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
---------- ----- -------- -------- -------- -------- ----------- ------------------
67.109 1 67.109 67.109 67.109 67.109 0.000 [CUDA memcpy DtoH]
67.109 1 67.109 67.109 67.109 67.109 0.000 [CUDA memcpy HtoD]
67.109 1 67.109 67.109 67.109 67.109 0.000 [CUDA memset]
So the bandwidth computed from the nsys kernel time is 67.109 MB * 2 / 91009 ns = 67.109 * 2 / 1000 / 0.000091009 ≈ 1474.77 GB/s (the factor of 2 is because the kernel reads the array once and writes it once).
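For reference, here is the same arithmetic as a tiny standalone C program; the names bytes_moved, seconds, and effective_bandwidth_GBps are only mine for illustration, and the inputs are the nsys numbers quoted above.

#include <stdio.h>

int main(void)
{
    /* one read plus one write of a 4096 x 4096 float array */
    double bytes_moved = 2.0 * (1 << 12) * (1 << 12) * sizeof(float);
    /* kernel time reported by nsys */
    double seconds = 91009e-9;
    double effective_bandwidth_GBps = bytes_moved / seconds / 1e9;
    printf("effective bandwidth = %.2f GB/s\n", effective_bandwidth_GBps);  /* ~1474.77 */
    return 0;
}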
But the bandwidth measured by ncu is as follows:
Using device 0: NVIDIA A800 80GB PCIe
CPU Execution Time elapsed 0.488664 sec
==PROF== Profiling "copyRow" - 0: 0%....50%....100% - 1 pass
copyRow Time elapsed 0.588242 sec
Results don't match!
115.000000(gpu_result[1] )!= 126.000000(cpu_result[1])
==PROF== Disconnected from process 3128802
[3128802] a.out@127.0.0.1
copyRow(float *, float *, int, int) (4, 1024, 1)x(256, 4, 1), Context 1, Stream 7, Device 0, CC 8.0
Section: Command line profiler metrics
-------------------------------- ------------ ------------
Metric Name Metric Unit Metric Value
-------------------------------- ------------ ------------
dram__bytes_read.sum.per_second Gbyte/second 729.49
dram__bytes_write.sum.per_second Gbyte/second 594.29
-------------------------------- ------------ ------------
The bandwidth reported by ncu is 729.49 + 594.29 = 1323.78 GB/s.
1323.78 / 1474.77 = 89.76%, which is not a negligible difference. What went wrong?
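For what it's worth, I could also time the kernel on the host with CUDA events to get a third number. The snippet below is only a sketch reusing the names from the listing above (copyRow, grid_1, block, A_dev, B_dev, nx, ny, nBytes); it is not part of the program I profiled.

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start, 0));
    copyRow<<<grid_1, block>>>(A_dev, B_dev, nx, ny);
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float ms = 0.0f;
    CHECK(cudaEventElapsedTime(&ms, start, stop));        // milliseconds
    double gbps = 2.0 * nBytes / (ms * 1e-3) / 1e9;       // one read + one write
    printf("event time %.3f ms, effective bandwidth %.2f GB/s\n", ms, gbps);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));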