Hi, I used nvprof to profile a simple vecAdd example (n = 1024) on a P100, but the reported dram_write_bytes is only 256 rather than the 1024 * 4 = 4096 bytes I expected. Can someone explain why this number is so small? Which other metrics do I need to collect to account for global memory writes? Thanks. The flop_count_sp value is correct (1024).
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
 *
 * a, b : input arrays, each holding at least n floats
 * c    : output array, holding at least n floats
 * n    : number of elements to process; n <= 0 performs no work
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid; each
 * thread starts at its flat global index and advances by the total number
 * of launched threads until it passes n. No shared memory is used.
 */
__global__ void vecAdd(float* a, float* b, float* c, int n){
    // Index math is done in 64 bits: the 32-bit unsigned product
    // blockIdx.x * blockDim.x can exceed INT_MAX on very large launches,
    // and narrowing it to int could yield a negative index that slips
    // past the bounds check.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}
/* Abort with file/line context when a CUDA runtime call does not succeed. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Host driver for the vecAdd kernel.
 *
 * Fills two host vectors of n = 1024 floats, copies them to the device,
 * launches vecAdd, copies the result back, and prints the mean of
 * (c[i] - a[i] - b[i]) over all elements as "final diff".
 *
 * Returns 0 on success. Exits with EXIT_FAILURE if a host allocation or
 * any CUDA runtime call fails. Command-line arguments are not used.
 */
int main(int argc, char* argv[]){
    (void)argc;
    (void)argv;

    const int n = 1024;
    const size_t bytes = (size_t)n * sizeof(float);

    /* Host buffers. */
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    /* Device buffers. */
    float *d_a = NULL;
    float *d_b = NULL;
    float *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    /* Input data; evaluated in double precision and stored as float. */
    int i;
    for(i = 0; i < n; i++){
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i+1);
    }

    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    /* Ceil-div so every element is covered for any n; for n = 1024 this is
       exactly one block of 1024 threads. */
    const int threads = 1024;
    const int blocks = (n + threads - 1) / threads;
    vecAdd <<<blocks, threads>>> (d_a, d_b, d_c, n);
    /* A kernel launch returns no status itself; query it explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* The check on this copy also reports any error raised while the
       kernel was executing. */
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    /* Mean residual between the device result and the host inputs. */
    float sum = 0;
    for(i = 0; i < n; i++)
        sum += h_c[i] - h_a[i] - h_b[i];
    printf("final diff: %f\n", sum/n);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
Is this related to sampling in nvprof? On one run I got 384 bytes, and sometimes I even got 0 bytes.