Hello everyone,
I am confused about where to put the iteration loop so that my kernel runs longer, and I would like to know the difference between these two approaches:
- Iterating inside the __global__ function (the kernel), for example with vector addition:
// Element-wise vector addition: C[i] = A[i] + B[i] for every i < numElements.
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockDim.x * blockIdx.x + threadIdx.x). The same addition and store
// are repeated a fixed number of times inside the kernel, purely to lengthen
// the kernel's run time; every pass writes the same value.
//
// A, B        input vectors (read only)
// C           output vector
// numElements number of valid elements; threads whose index is at or beyond
//             this value perform no memory access
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const int kRepeats = 1000;  // number of in-kernel repetitions
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const bool inRange = idx < numElements;  // loop-invariant bounds check

    int pass = 0;
    while (pass < kRepeats)
    {
        if (inRange)
            C[idx] = A[idx] + B[idx];

        // Kept outside the bounds check so that every thread of the block
        // reaches the barrier on every pass, including out-of-range threads.
        __syncthreads();
        ++pass;
    }
}
- Iterating outside the __global__ function, in my main(), like this:
// Time nIter back-to-back launches of vectorAdd using a pair of CUDA events.
int nIter = 1000;

cudaEventCreate(&start);
cudaEventCreate(&stop);

// Both events are recorded in stream 0, the same stream the launches below
// use (no stream argument is given), so they bracket the whole batch.
cudaEventRecord(start, 0);
for (int remaining = nIter; remaining > 0; --remaining)
{
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
}
cudaEventRecord(stop, 0);

// Launches are asynchronous: block the host until the stop event has actually
// been reached before reading the elapsed time.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);  // ms receives the time in milliseconds

cudaEventDestroy(start);
cudaEventDestroy(stop);
Q1: Is there no overhead with the second approach?
Q2: Or do you think the two approaches are similar?
Q3: Which approach should I adopt to be sure that I am actually repeating my kernel?
Thank you in advance for your response,
Dorra