Computation efficiency of a Quadro P620 Nvidia card

                                                                      December 28, 2021

Hello!

I am trying to understand the computation efficiency of my Quadro P620 Nvidia card. For that purpose, I wrote the following test program efficiency.cu.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>

#define N (16*1024*1024)
#define nMAD 100

__global__ void cuda_saxpy(float a, float* x, float* y){
int i=blockIdx.x*blockDim.x+threadIdx.x;
y[i]=a*x[i]+y[i];
}

__global__ void cuda_saipb(float a, float b, float* y){
int i=blockIdx.x*blockDim.x+threadIdx.x;
y[i]=a*i+b;
}

__global__ void cuda_algf(float *a, float *b, float *c){
int i=blockIdx.x*blockDim.x+threadIdx.x;
float va=a[i], vb=b[i], vc=c[i];
for (int k=0; k<nMAD; ++k) { vc=vc+va*vb; }
c[i]=vc;
//printf("Device: Hello from blockIdx.x=%d threadIdx.x=%d, c[i=%d]=%f\n",blockIdx.x,threadIdx.x,i,c[i]);
}

int main() {
std::cout << "Host: Good morning. N=" << N << " nMAD=" << nMAD << "\n";
double vN=N, vnMAD=nMAD;
float* a = new float[N]; float* b = new float[N]; float* c = new float[N];
int size=N * sizeof(float);
cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop);
for (int i = 0; i < N; ++i) { a[i] =0.5f ; b[i] =1.f; c[i]=0.f; }
float* d_a = nullptr; cudaMalloc((void**)&d_a, size);
float* d_b = nullptr; cudaMalloc((void**)&d_b, size);
float* d_c = nullptr; cudaMalloc((void**)&d_c, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
int nt=256;
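// launch configuration: nt threads per block and N/nt blocks; N is a multiple of nt, so each element is handled by exactly one thread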

cudaEventRecord(start);
//cuda_saxpy<<<1,2>>>(2.f,d_b, d_c);
cuda_saxpy<<<N/nt,nt>>>(2.f,d_b, d_c);
cudaDeviceSynchronize(); 
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop); 
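// 2 FLOPs per element (one multiply, one add) for y[i]=a*x[i]+y[i];
// 3 float transfers of 4 bytes per element (read x[i], read y[i], write y[i])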
double GFLOPSsaxpy=2.*vN/(milliseconds*1e6); 
double BWsaxpy=vN*4.*3./(milliseconds*1e6); 

for (int i = 0; i < N; ++i) { c[i]=0.f; }
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
cudaEventRecord(start);
//cuda_saipb<<<1,2>>>(0.1f,0.01f,d_c);
cuda_saipb<<<N/nt,nt>>>(0.1f,0.01f,d_c);
cudaDeviceSynchronize(); 
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop); 
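// cuda_saipb also performs 2 FLOPs per element (one multiply, one add)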
double GFLOPSsaipb=2.*vN/(milliseconds*1e6); 

for (int i = 0; i < N; ++i) { c[i]=0.f; }
cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
cudaEventRecord(start);
//cuda_algf<<<1,2>>>(d_a,d_b,d_c); 
cuda_algf<<<N/nt,nt>>>(d_a,d_b,d_c); 
cudaDeviceSynchronize(); 
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop); 
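// cuda_algf performs nMAD multiply-add pairs per element, i.e. 2*nMAD FLOPs per element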
double GFLOPSalgf=vnMAD*2.*vN/(milliseconds*1e6); 

cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {if (i%5000000==1) 
    std::cout << "i=" << i << " a=" << a[i] << " b=" << b[i] << " c=" << c[i] << std::endl;
}

cudaEventDestroy(start); cudaEventDestroy(stop);
//delete[] a; delete[] b; delete[] c; 
//cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); 
std::cout << "GFLOPS saxpy=" << GFLOPSsaxpy << "\t" << "GFLOPS saipb=" << GFLOPSsaipb <<"\n"; 
std::cout << "GFLOPS algf=" << GFLOPSalgf <<"\n"; 
printf("Effective Bandwidth saxpy (GB/s)= %f\n",BWsaxpy); 
printf("Theoretical Bandwidth Quadro P620 (GB/s)= %f\n",3004.e6*(128./8.)*2./(1.e9)); 
std::cout << "Host: Bye\n";
return 0;

}

where the constant nMAD makes the kernel cuda_algf repeat its multiply-and-add operation nMAD times per element, so that the computation time far exceeds the duration of the memory transfers.
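To make the comparison explicit, here is the back-of-the-envelope model I have in mind for the operation and byte counts, written as a small standalone program (a sketch only; the byte counts assume exactly one 4-byte global-memory access per array read or write, which remains to be checked):

#include <stdio.h>

int main() {
    const double vN = 16.0*1024*1024;   // number of elements, as in efficiency.cu
    const double vnMAD = 100;           // repetitions in cuda_algf
    // cuda_saxpy: y[i]=a*x[i]+y[i] -> 2 FLOPs; reads x[i] and y[i], writes y[i] -> 12 bytes
    double AIsaxpy = (2.*vN)/(3.*4.*vN);
    // cuda_saipb: y[i]=a*i+b -> 2 FLOPs; writes y[i] only -> 4 bytes
    double AIsaipb = (2.*vN)/(1.*4.*vN);
    // cuda_algf: nMAD multiply-adds per element -> 2*nMAD FLOPs; reads a[i], b[i], c[i], writes c[i] -> 16 bytes
    double AIalgf = (2.*vnMAD*vN)/(4.*4.*vN);
    printf("FLOPs per byte: saxpy=%g saipb=%g algf=%g\n", AIsaxpy, AIsaipb, AIalgf);
    return 0;
}

These are the counts behind the GFLOPSsaxpy, GFLOPSsaipb, GFLOPSalgf and BWsaxpy expressions in the program above.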

Please, could someone explain to me why the GFLOPS numbers of the kernels cuda_saxpy and cuda_saipb seem much too small with respect to the theoretical memory bandwidth, assuming my estimate of the latter (which still needs to be checked) is correct?
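For the comparison I have in mind, and assuming (to be checked) that cuda_saxpy is limited by its memory traffic of 12 bytes per element, the theoretical bandwidth would translate into an upper bound on its GFLOPS roughly as follows:

#include <stdio.h>

int main() {
    // theoretical bandwidth of the Quadro P620, same expression as in efficiency.cu (GB/s)
    double BWtheo = 3004.e6*(128./8.)*2./(1.e9);
    // if the kernel can do no more than 2 FLOPs per 12 bytes moved,
    // its FLOP rate is bounded by bandwidth * (2/12) FLOPs per byte (assumption to be checked)
    double GFLOPSmax_saxpy = BWtheo*2./12.;
    printf("theoretical BW = %f GB/s -> at most about %f GFLOPS for cuda_saxpy\n", BWtheo, GFLOPSmax_saxpy);
    return 0;
}

If this reasoning is correct, that bound is what I should compare the measured GFLOPSsaxpy against.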

A satisfactory GFLOPS figure is only obtained with the kernel cuda_algf, which repeats the multiply-add operation an arbitrary number of times.

Thanks in advance for your help.
Best regards,
Pascal