My problem is that the value reported by “nvprof -m dram_read_bytes” has a small error compared to the theoretical value, and the error becomes much larger when another --metrics option is added to the same command.
My device: V100-SXM2 16GB
nvidia driver version: 410.104
cuda version: 10.0
nvprof version: Release version 10.0.130 (21)
nvcc version: release 10.0, V10.0.130
Below is my full code: a simple kernel that adds two vectors of 1 million ints each.
#include <cuda_runtime.h>
#include <iostream>

using std::cout;
using std::endl;

#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
    if (cuda_error != cudaSuccess) \
        cout << "cudaError_t: " << cuda_error << " != 0 " \
             << cudaGetErrorString(cuda_error) << endl; \
}
#define LEN 1000000
// kernel function: element-wise add of two length-N vectors
// (the loop starts at threadIdx.x and strides by gridDim.x * blockDim.x,
//  which covers all N elements because the kernel is launched with a single block)
template<typename Dtype>
__global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c){
    for (int i = threadIdx.x; i < N; i += gridDim.x * blockDim.x){
        c[i] = a[i] + b[i];
    }
}
int main(){
    // host memory allocation & initialization
    int* host_a = new int[LEN];
    int* host_b = new int[LEN];
    int* host_c = new int[LEN];
    for (int i = 0; i < LEN; ++i){
        host_a[i] = i;
        host_b[i] = i * 100;
        host_c[i] = -1;
    }

    // select GPU device
    int device_id = 0;
    CUDA_CHECK(cudaSetDevice(device_id));
    cout << "Using GPU " << device_id << "." << endl;

    // cudaMalloc & cudaMemcpy & cudaMemset
    int* dev_a;
    int* dev_b;
    int* dev_c;
    CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, host_b, LEN * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(dev_c, 0, LEN * sizeof(int)));

    // launch with a single block of 128 threads
    dim3 grid_dim(1, 1, 1);
    dim3 block_dim(128, 1, 1);
    add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
    CUDA_CHECK(cudaMemcpy(host_c, dev_c, LEN * sizeof(int), cudaMemcpyDeviceToHost));

    // free GPU memory & CPU memory
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    delete[] host_a;
    delete[] host_b;
    delete[] host_c;
    return 0;
}
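(For reference, a short host-side check like the following could be placed right before the cudaFree calls to confirm the kernel output. It is not part of the program I actually profiled, so it does not affect the numbers below.)

// hypothetical correctness check (not in the profiled binary):
// every output element should equal the sum of the two inputs
bool ok = true;
for (int i = 0; i < LEN; ++i) {
    if (host_c[i] != host_a[i] + host_b[i]) {
        ok = false;
        break;
    }
}
cout << (ok ? "Result OK" : "Result mismatch") << endl;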
Build the code using:
nvcc add.cu
First, I profile ./a.out with only the dram_read_bytes metric:
nvprof -m dram_read_bytes ./a.out
Using GPU 0.
==309850== Profiling application: ./a.out
==309850== Profiling result:
==309850== Metric result:
Invocations          Metric Name                              Metric Description           Min          Max          Avg
Device "Tesla V100-SXM2-16GB (0)"
    Kernel: void add_kernel<int>(int, int const *, int const *, int*)
          1      dram_read_bytes          Total bytes read from DRAM to L2 cache       7999936      7999936      7999936
This is the first observation: every time I run “nvprof -m dram_read_bytes ./a.out”, the reported dram_read_bytes varies around 8,000,000 with only negligible error. (I tried dozens of times.)
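The theoretical value I compare against assumes the two input arrays are each read from DRAM exactly once and dev_c is only written (the variable name below is just for illustration):

// expected DRAM read traffic under that assumption:
// 2 arrays * 1,000,000 elements * 4 bytes = 8,000,000 bytes
size_t expected_dram_read_bytes = 2UL * LEN * sizeof(int);  // = 8,000,000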
Then I profile ./a.out with another --metrics option specified along with dram_read_bytes:
nvprof -m dram_read_bytes -m sysmem_read_bytes ./a.out
Using GPU 0.
==309850== Profiling application: ./a.out
==309850== Profiling result:
==309850== Metric result:
Invocations          Metric Name                              Metric Description           Min          Max          Avg
Device "Tesla V100-SXM2-16GB (0)"
    Kernel: void add_kernel<int>(int, int const *, int const *, int*)
          1      dram_read_bytes          Total bytes read from DRAM to L2 cache       7407200      7407200      7407200
          1    sysmem_read_bytes                        System Memory Read Bytes             0            0            0
Here is the second problem: the reported dram_read_bytes becomes much smaller than the theoretical value of 8,000,000. (I tried dozens of times.)
Is this a bug, or does anyone know the reason for it?