Computation efficiency of a Quadro P620 Nvidia card

                                                                      December 28, 2021

Hello!

I am trying to understand the computation efficiency of my Quadro P620 Nvidia card. For that purpose, I wrote the following test program efficiency.cu.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>

#define N (16*1024*1024)
#define nMAD 100

__global__ void cuda_saxpy(float a, float* x, float* y){
int i=blockIdx.x*blockDim.x+threadIdx.x;
y[i]=a*x[i]+y[i];
}

__global__ void cuda_saipb(float a, float b, float* y){
int i=blockIdx.x*blockDim.x+threadIdx.x;
y[i]=a*i+b;
}

__global__ void cuda_algf(float *a, float *b, float *c){
int i=blockIdx.x*blockDim.x+threadIdx.x;
float va=a[i], vb=b[i], vc=c[i];
for (int k=0; k<nMAD; ++k) { vc=vc+va*vb; }
c[i]=vc;
//printf("Device: Hello from blockIdx.x=%d threadIdx.x=%d, c[i=%d]=%f\n",blockIdx.x,threadIdx.x,i,c[i]);
}

int main() {
std::cout << "Host: Good morning. N=" << N << " nMAD=" << nMAD << "\n";
double vN=N, vnMAD=nMAD;
float* a = new float[N]; float* b = new float[N]; float* c = new float[N];
int size=N * sizeof(float);
cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
for (int i = 0; i < N; ++i) { a[i] =0.5f ; b[i] =1.f; c[i]=0.f; }
float* d_a = nullptr; cudaMalloc((void**)&d_a, size);
float* d_b = nullptr; cudaMalloc((void**)&d_b, size);
float* d_c = nullptr; cudaMalloc((void**)&d_c, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
int nt=256;
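// launch configuration: nt threads per block and N/nt blocks; N is a multiple of nt, so each element is handled by exactly one thread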

cudaEventRecord(start);
//cuda_saxpy<<<1,2>>>(2.f,d_b, d_c);
cuda_saxpy<<<N/nt,nt>>>(2.f,d_b, d_c);
cudaDeviceSynchronize(); 
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop); 
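// 2 FLOPs per element (one multiply, one add) for y[i]=a*x[i]+y[i];
// 3 float transfers of 4 bytes per element (read x[i], read y[i], write y[i])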
double GFLOPSsaxpy=2.*vN/(milliseconds*1e6); 
double BWsaxpy=vN*4.*3./(milliseconds*1e6); 

for (int i = 0; i < N; ++i) { c[i]=0.f; }
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
cudaEventRecord(start);
//cuda_saipb<<<1,2>>>(0.1f,0.01f,d_c);
cuda_saipb<<<N/nt,nt>>>(0.1f,0.01f,d_c);
cudaDeviceSynchronize(); 
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop); 
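// cuda_saipb also performs 2 FLOPs per element (one multiply, one add)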
double GFLOPSsaipb=2.*vN/(milliseconds*1e6); 

for (int i = 0; i < N; ++i) { c[i]=0.f; }
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
cudaEventRecord(start);
//cuda_algf<<<1,2>>>(d_a,d_b,d_c); 
cuda_algf<<<N/nt,nt>>>(d_a,d_b,d_c); 
cudaDeviceSynchronize(); 
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop); 
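// cuda_algf performs nMAD multiply-add pairs per element, i.e. 2*nMAD FLOPs per element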
double GFLOPSalgf=vnMAD*2.*vN/(milliseconds*1e6); 

cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {if (i%5000000==1) 
    std::cout << "i=" << i << " a=" << a[i] << " b=" << b[i] << " c=" << c[i] << std::endl;
}

cudaEventDestroy(start); cudaEventDestroy(stop);
//delete[] a; delete[] b; delete[] c; 
//cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); 
std::cout << "GFLOPS saxpy=" << GFLOPSsaxpy << "\t" << "GFLOPS saipb=" << GFLOPSsaipb <<"\n"; 
std::cout << "GFLOPS algf=" << GFLOPSalgf <<"\n"; 
printf("Effective Bandwidth saxpy (GB/s)= %f\n",BWsaxpy); 
printf("Theoretical Bandwidth Quadro P620 (GB/s)= %f\n",3004.e6*(128./8.)*2./(1.e9)); 
std::cout << "Host: Bye\n";
return 0;

}

where the constant nMAD makes the kernel cuda_algf repeat its multiply-and-add operation nMAD times per element, so that the computation time far exceeds the duration of the memory transfers.
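To make the comparison explicit, here is the back-of-the-envelope model I have in mind for the operation and byte counts, written as a small standalone program (a sketch only; the byte counts assume exactly one 4-byte global-memory access per array read or write, which remains to be checked):

#include <stdio.h>

int main() {
    const double vN = 16.0*1024*1024;   // number of elements, as in efficiency.cu
    const double vnMAD = 100;           // repetitions in cuda_algf
    // cuda_saxpy: y[i]=a*x[i]+y[i] -> 2 FLOPs; reads x[i] and y[i], writes y[i] -> 12 bytes
    double AIsaxpy = (2.*vN)/(3.*4.*vN);
    // cuda_saipb: y[i]=a*i+b -> 2 FLOPs; writes y[i] only -> 4 bytes
    double AIsaipb = (2.*vN)/(1.*4.*vN);
    // cuda_algf: nMAD multiply-adds per element -> 2*nMAD FLOPs; reads a[i], b[i], c[i], writes c[i] -> 16 bytes
    double AIalgf = (2.*vnMAD*vN)/(4.*4.*vN);
    printf("FLOPs per byte: saxpy=%g saipb=%g algf=%g\n", AIsaxpy, AIsaipb, AIalgf);
    return 0;
}

These are the counts behind the GFLOPSsaxpy, GFLOPSsaipb, GFLOPSalgf and BWsaxpy expressions in the program above.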

Please, could someone explain to me why the GFLOPS numbers of the kernels cuda_saxpy and cuda_saipb seem much too small with respect to the theoretical memory bandwidth, assuming my estimate of the latter (which still needs to be checked) is correct?
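For the comparison I have in mind, and assuming (to be checked) that cuda_saxpy is limited by its memory traffic of 12 bytes per element, the theoretical bandwidth would translate into an upper bound on its GFLOPS roughly as follows:

#include <stdio.h>

int main() {
    // theoretical bandwidth of the Quadro P620, same expression as in efficiency.cu (GB/s)
    double BWtheo = 3004.e6*(128./8.)*2./(1.e9);
    // if the kernel can do no more than 2 FLOPs per 12 bytes moved,
    // its FLOP rate is bounded by bandwidth * (2/12) FLOPs per byte (assumption to be checked)
    double GFLOPSmax_saxpy = BWtheo*2./12.;
    printf("theoretical BW = %f GB/s -> at most about %f GFLOPS for cuda_saxpy\n", BWtheo, GFLOPSmax_saxpy);
    return 0;
}

If this reasoning is correct, that bound is what I should compare the measured GFLOPSsaxpy against.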

A satisfactory GFLOPS figure is only obtained with the kernel cuda_algf, which repeats the multiply-add operation an arbitrary number of times.

Thanks in advance for your help.
Best regards,
Pascal