Execution timings varying from instance to instance

Hi,

I ran the following code and noticed that the execution time changes from run to run; I have attached the logged timings. Does anyone know why this happens?

Example.cu (4.6 KB)
TIMINGS.csv (97.7 KB)

Please post code by pasting it into your question and use proper formatting. Do not use attachments to post a simple code file.

Please explain the specific aspects of the timing that you would like explained. There are likely multiple things happening.

The code I have used to calculate the execution time is posted below. It was tested 10,000 times, and I observed a variation in the execution timings: I got a minimum execution time of 0.0309 ms and a maximum of 0.7539 ms, and it varies on every execution. Can I know how I can overcome this?

Please put the whole code into a code section (</> in editor).

Execution of the first kernel is probably slower because the data in managed memory is transferred to the device. Subsequent kernel launches involve no transfers because the data is already present on the device.
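If you want to take that one-time migration out of the picture entirely, you can migrate the managed buffers to the GPU once before the timed loop. A minimal sketch, assuming a Linux system where the device reports concurrentManagedAccess (cudaMemPrefetchAsync is available from CUDA 8.0 on Pascal); the buffer names and sizes are the ones from the code in this thread:

// Prefetch the managed allocations to the device so the first timed kernel
// does not pay for on-demand page migration.
int iDev = 0;
cudaGetDevice(&iDev);
cudaMemPrefetchAsync(DpusImg, IMAGE_LENGTH * sizeof(unsigned short), iDev, 0);
cudaMemPrefetchAsync(DpusMin, IMAGE_LENGTH * sizeof(unsigned short), iDev, 0);
cudaDeviceSynchronize();   // make sure the migration has finished before timing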

The code used is

#include <stdio.h>
#include <math.h>
#include <npp.h>
#include <cuda.h>
#include <nppdefs.h> 
#include <cuda_runtime.h>
#include <sys/time.h>

#include <iostream>
#include <bits/stdc++.h>
using namespace std;


#define IMAGE_HEIGHT	512
#define IMAGE_WIDTH		640
#define IMAGE_LENGTH	(IMAGE_HEIGHT * IMAGE_WIDTH)
#define E_ERR_FILE_OPEN  -1
#define ERR_NULL_POINTER -2
#define E_ERR_MIN        -3   /* used by the kernel-launch check in Min_Gpu (disabled with #if 0 below) */
int DataRead(char* in_pcFileName, unsigned int in_uiImgHeight, unsigned int in_uiImgWidth, unsigned short* out_pusImg);

__global__ void Min_Kernel(unsigned short* in_DpusOriginalImg, unsigned short* in_DpusMorphologyImg, unsigned int in_uiImagSize, unsigned short* out_DpusImg)
{
    ///Index values
    unsigned int  uiIdx = blockIdx.x * blockDim.x + threadIdx.x;

    if(uiIdx < in_uiImagSize)
    {
        if(in_DpusOriginalImg[uiIdx] < in_DpusMorphologyImg[uiIdx])
        {
            out_DpusImg[uiIdx] = in_DpusOriginalImg[uiIdx];
        }
        else
        {
            out_DpusImg[uiIdx] = in_DpusMorphologyImg[uiIdx];
        }
    }
}

int Min_Gpu(unsigned short* in_DpusImg, unsigned short* in_DpusNewImg, unsigned int in_uiMaxHeight, unsigned int in_uiMaxWidth, unsigned short* out_DpusImg)
{
    int iRetVal = 0;
    unsigned int uiImgSize = in_uiMaxHeight * in_uiMaxWidth;

    ///cuda Error object
    cudaError_t cudaStatus;

    ///cuda Time variable
    float fTime = 0.0f;

    ///cuda Time objects & Events
    cudaEvent_t ceStart;
    cudaEvent_t ceStop;

    FILE *pfTimings = NULL;

    pfTimings = fopen("MIN_TIMINGS.csv", "a");

    cudaEventCreate(&ceStart);
    cudaEventCreate(&ceStop);

    ///cuda Event Start
    cudaEventRecord(ceStart, 0);

    /// One thread per pixel: in_uiMaxHeight blocks of in_uiMaxWidth threads (512 x 640)
    Min_Kernel<<<in_uiMaxHeight, in_uiMaxWidth>>>(in_DpusImg, in_DpusNewImg, uiImgSize, out_DpusImg);

    /// Blocks until the device has completed all preceding requested tasks.
    cudaDeviceSynchronize();

    /// Check for any errors launching the kernel
    #if 0
    cudaStatus = cudaGetLastError();
    if(cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "Min Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        iRetVal = E_ERR_MIN;
    }
    #endif

    ///cuda Event stop
    cudaEventRecord(ceStop, 0);
    cudaEventSynchronize(ceStop);

    ///Finding the Elapsed Time
    cudaEventElapsedTime(&fTime, ceStart, ceStop);

    if(pfTimings != NULL)
    {
        fprintf(pfTimings, "%f \n", fTime);
        fflush(pfTimings);
        fclose(pfTimings);
    }

    ///Destroy the cudaEvent objects
    cudaEventDestroy(ceStart);
    cudaEventDestroy(ceStop);

    return iRetVal;
}


int main()
{
    int iRetVal = 0;
    int iLoop = 0;
    char carrFileName[100] = {0};

    unsigned short *DpusImg = NULL;
    unsigned short *DpusMin = NULL;

    cudaMallocManaged((void **)&DpusImg, (IMAGE_LENGTH * sizeof(unsigned short)));
    cudaMallocManaged((void **)&DpusMin, (IMAGE_LENGTH * sizeof(unsigned short)));

    sprintf(carrFileName, "Frame-1.csv");
    iRetVal = DataRead(carrFileName, IMAGE_HEIGHT, IMAGE_WIDTH, DpusImg);
    for(iLoop = 0; iLoop < 10000; iLoop++)
    {
        iRetVal = Min_Gpu(DpusImg, DpusImg, IMAGE_HEIGHT, IMAGE_WIDTH, DpusMin);
        if(iRetVal == 0)
        {
            printf("min success\n");
        }
        else
        {
            printf("error in min\n");
        }
    }
    cudaFree(DpusImg);
    cudaFree(DpusMin);

    return iRetVal;
}

int DataRead(char* in_pcFileName, unsigned int in_uiImgHeight, unsigned int in_uiImgWidth, unsigned short* out_pusImg)
{
    unsigned int uiHeightCnt = 0;
    unsigned int uiWidthCnt = 0;
    FILE *fp_RawImg = NULL;
    int iRetVal = 0;

    /// Inputs & Outputs pointers validation
    if((out_pusImg == NULL) || (in_pcFileName == NULL))
    {
        iRetVal = ERR_NULL_POINTER;
    }
    else
    {
        /// Open the csv files in Reading Mode
        fp_RawImg = fopen(in_pcFileName,"r");

        ///Check the file pointers
        if(fp_RawImg == NULL)
        {
            printf(">>errore!\n");
            iRetVal = E_ERR_FILE_OPEN;
        }
        else
        {
            for(uiHeightCnt = 0; uiHeightCnt < in_uiImgHeight ; uiHeightCnt++)
            {
                for(uiWidthCnt = 0; uiWidthCnt < in_uiImgWidth ; uiWidthCnt++)
                {
                    /// Read Image data from file
                    fscanf(fp_RawImg,"%hu,",&out_pusImg[(uiHeightCnt * in_uiImgWidth) + uiWidthCnt]);
                    
                }
                fscanf(fp_RawImg,"\n");
            }
            /// Close the csv files
            fclose(fp_RawImg);
        }
    }
    return iRetVal;
}

But in between I am also getting spikes in execution time. You can check the timings I have recorded in the attached TIMINGS.csv file.

Your timing data suggest run-time variability (roughly between 0.028 ms and 0.036 ms), but not what I would classify as spikes. The kernel code looks very much memory-bandwidth limited. Noisy timings are common with memory-intensive code in general (on CPUs and GPUs), which is why a not uncommon benchmarking approach is to report the fastest of ten runs; that tends to be much more stable.

There may be other factors making these measurements noisy, such as dynamic clocking of the GPU and a presumed lack of memory affinity on the host side while managed memory is being used. It probably does not matter here, but I would call cudaDeviceSynchronize() before starting the timer, to make sure the GPU is idle at that point.
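Putting both suggestions together, a minimal sketch of a fastest-of-ten measurement loop, reusing the event objects and buffer names from the code above (the initial cudaDeviceSynchronize() ensures the GPU is idle before the start event):

// Fastest-of-N timing: run the same kernel N times and keep the smallest time.
const int iRuns = 10;
float fBest = FLT_MAX;                  // FLT_MAX from <float.h>
cudaDeviceSynchronize();                // GPU idle before the first start event
for(int iRun = 0; iRun < iRuns; iRun++)
{
    float fTime = 0.0f;
    cudaEventRecord(ceStart, 0);
    Min_Kernel<<<IMAGE_HEIGHT, IMAGE_WIDTH>>>(DpusImg, DpusImg, IMAGE_LENGTH, DpusMin);
    cudaEventRecord(ceStop, 0);
    cudaEventSynchronize(ceStop);       // wait until the stop event has completed
    cudaEventElapsedTime(&fTime, ceStart, ceStop);
    if(fTime < fBest)
    {
        fBest = fTime;                  // keep only the best run
    }
}
printf("fastest of %d runs: %f ms\n", iRuns, fBest);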

Again, it probably doesn’t matter, but I would declare the image pointers passed to the kernel as __restrict__.
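For the kernel in this thread, that would look like the following sketch (const added as well, since the inputs are only read; note that main() passes the same pointer for both inputs, which is harmless for __restrict__ here because neither input is written through):

// __restrict__ promises the compiler the pointers do not alias, and const
// marks the inputs as read-only; the kernel body stays unchanged.
__global__ void Min_Kernel(const unsigned short* __restrict__ in_DpusOriginalImg,
                           const unsigned short* __restrict__ in_DpusMorphologyImg,
                           unsigned int in_uiImagSize,
                           unsigned short* __restrict__ out_DpusImg)
{
    // body unchanged
}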

Thank you for your advice. I will check it and get back to you.

I have tried "restrict " and “cudaDeviceSynchronize()” but there is no difference in timings. and the GPU I’m using is
GPU : NVIDIA® Quadro® P2000
Memory : 4GB GDDR5 memory, 128-bit (Bandwidth: 96 GB/s)
CUDA Cores : 768 CUDA® cores, 2.3 TFLOPS SP Peak
Compute API : CUDA Toolkit 8.0, CUDA Compute Capability 6.1, OpenCL™ 1.2
Graphics API : DirectX® 12, OpenGL 4.5, Vulkan 1.0
Display Outputs : 4x DisplayPort 1.4 digital video outputs, 4K at 120Hz or 5K at 60Hz
Interface : MXM 3.1, PCI Express Gen3 x16 support
Dimensions : 82 (W) x 70 (D) x 4.8 (H) mm
Form Factor : Standard MXM 3.1 Type A
Power Consumption : 58W
OS Support : Windows 10, Linux support by project

That jibes with what I said:

My point was that one should design timing experiments carefully, and that includes mitigating any known factors that could unnecessarily inflate execution times or increase measurement noise.

Getting stable execution-time measurements on any complex modern system requires tying down as many loose ends as possible. That includes locking CPU and GPU clock frequencies as much as possible and, if any host-side activity is involved, may require pinning execution to a particular CPU core (processor affinity and memory affinity).
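For the host side, a hedged sketch for Linux (the helper name is made up for illustration; on Windows one would use SetThreadAffinityMask instead, and the nvidia-smi options require administrator rights and vary by driver and board):

// Pin the benchmarking process to a single CPU core so the host side of the
// measurement always runs on the same core (and thus the same memory node).
#define _GNU_SOURCE
#include <sched.h>

void PinToCore0(void)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(0, &set);                           // core 0, chosen arbitrarily
    sched_setaffinity(0, sizeof(set), &set);    // pid 0 = the calling process
}

// GPU clock frequencies are fixed outside the program, e.g. via application
// clocks on Quadro/Tesla boards:
//   nvidia-smi -ac <memMHz>,<smMHz>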

Even with that in place, memory subsystems are layered and involve many acceleration mechanisms, so the exact state of the subsystem is not known and/or is difficult to reproduce exactly (e.g. which memory pages are currently kept open by a DRAM controller). That is why I suggested applying the best-run-of-ten principle: measure ten times, and use the best time observed across the ten runs.
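As a rough sanity check of the bandwidth-limited reading of your numbers (a back-of-the-envelope estimate using the 96 GB/s figure from your specification list): the kernel reads two 512 × 640 unsigned short images and writes one, i.e. about 3 × 512 × 640 × 2 bytes ≈ 1.97 MB per launch. At 96 GB/s that alone takes ≈ 0.020 ms, so your fastest observed times of roughly 0.028 ms are already fairly close to the memory-bandwidth bound; the variability you are seeing sits on top of that hard floor.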