Simple parallel code behaves oddly with concurrent kernels

Hi all,

I'm fairly new to CUDA programming. I recently got my hands on a Fermi card (a 2050) and started experimenting with it.

I came across the concurrent kernel execution section of the programming guide and wanted to see what kind of performance improvement I could get by rewriting some of my kernels.

I started with my simplest one: an element-wise sum of two complex vectors.

The non-concurrent version of my code is as follows:

//kernel
__global__ void
VectorVectorSumKernelCC_O(const float2* aIn1,
                          const float2* aIn2,
                          float2* aOut,
                          const unsigned int aSize) {
  const unsigned int myPos = blockIdx.x * blockDim.x + threadIdx.x;
  if (myPos < aSize) {
    aOut[myPos].x = aIn1[myPos].x + aIn2[myPos].x;
    aOut[myPos].y = aIn1[myPos].y + aIn2[myPos].y;
  }
}

#define BLOCK_SIZE (1<<9)

//host
__host__ cudaError_t CudaSumVecVecCC_O(const float2* aIn1,
                                       const float2* aIn2,
                                       float2* aOut,
                                       const unsigned int aSize) {
  dim3 myThreads(BLOCK_SIZE);
  dim3 myGrid((aSize + BLOCK_SIZE - 1) / BLOCK_SIZE);

  cudaThreadSynchronize();
  VectorVectorSumKernelCC_O<<<myGrid, myThreads>>>(aIn1, aIn2, aOut, aSize);

  return cudaSuccess;
}
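For context, this is roughly how I call the wrapper in my tests; the d_* names below are just placeholders, and the buffers are ordinary device allocations (a minimal sketch, not my exact test code):

// Minimal usage sketch (illustrative): the wrapper expects device pointers,
// so the buffers live on the GPU and are filled with cudaMemcpy beforehand.
const unsigned int mySize = 1 << 20;                     // example vector length
float2 *d_In1 = 0, *d_In2 = 0, *d_Out = 0;
cudaMalloc((void**)&d_In1, mySize * sizeof(float2));
cudaMalloc((void**)&d_In2, mySize * sizeof(float2));
cudaMalloc((void**)&d_Out, mySize * sizeof(float2));
// ... fill d_In1 / d_In2 with cudaMemcpy from host data ...
CudaSumVecVecCC_O(d_In1, d_In2, d_Out, mySize);
cudaThreadSynchronize();                                 // wait for the kernel to finish
// ... copy d_Out back to the host and cudaFree the buffers ...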

Leaving the kernel code untouched, I chose the following way to deploy concurrent kernels:

// Small RAII helper that creates N streams on construction and destroys them on destruction.
template<int N>
class CudaStream {
 public:
  CudaStream() {
    for (int i = 0; i < N; i++) {
      cudaStreamCreate(&theStream[i]);
    }
  }
  ~CudaStream() {
    for (int i = 0; i < N; i++) {
      cudaStreamDestroy(theStream[i]);
    }
  }
  cudaStream_t theStream[N];
};

#define BLOCK_SIZE (1<<9)

//host
__host__ cudaError_t CudaSumVecVecCC_O(const float2* aIn1,
                                       const float2* aIn2,
                                       float2* aOut,
                                       const unsigned int aSize) {
  const size_t myNumKernels = (1 << 3);
  static CudaStream<myNumKernels> myStreams;

  const size_t myChunkSize = aSize / myNumKernels;
  dim3 myThreads(BLOCK_SIZE);
  dim3 myGrid((myChunkSize + BLOCK_SIZE - 1) / BLOCK_SIZE);

  // One extra kernel is needed when aSize is not a multiple of myNumKernels.
  const int myRealKernNumb = (aSize % myNumKernels != 0) ? myNumKernels + 1 : myNumKernels;

  // Launch one chunk per stream.
  for (int i = 0; i < myNumKernels; i++) {
    VectorVectorSumKernelCC_O<<<myGrid, myThreads, 0, myStreams.theStream[i]>>>(
        &aIn1[i * myChunkSize],
        &aIn2[i * myChunkSize],
        &aOut[i * myChunkSize],
        myChunkSize);
  }

  // Remainder chunk, if any (the stream index wraps around, since theStream
  // only has myNumKernels entries).
  if (aSize % myNumKernels != 0) {
    const int i = myRealKernNumb - 1;
    VectorVectorSumKernelCC_O<<<myGrid, myThreads, 0,
                                myStreams.theStream[i % myNumKernels]>>>(
        &aIn1[i * myChunkSize],
        &aIn2[i * myChunkSize],
        &aOut[i * myChunkSize],
        aSize - (i * myChunkSize));
  }

  return cudaSuccess;
}

Note that there is one extra kernel invocation in case aSize is not a multiple of myNumKernels.

The host function CudaSumVecVecCC_O is invoked repeatedly inside a performance test, with the vector size increasing from 2^10 to 2^22 in powers of two, so the extra (remainder) kernel invocation never actually happens.
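The timing harness is conceptually similar to the sketch below; the cudaEvent-based timing here is only illustrative, and my real loop differs in details such as warm-up runs and averaging:

// Illustrative timing loop: vector sizes go from 2^10 to 2^22 in powers of two.
for (unsigned int myExp = 10; myExp <= 22; ++myExp) {
  const unsigned int mySize = 1u << myExp;

  cudaEvent_t myStart, myStop;
  cudaEventCreate(&myStart);
  cudaEventCreate(&myStop);

  cudaEventRecord(myStart, 0);
  CudaSumVecVecCC_O(d_In1, d_In2, d_Out, mySize);    // d_* buffers sized for the largest case
  cudaEventRecord(myStop, 0);
  cudaEventSynchronize(myStop);

  float myMs = 0.0f;
  cudaEventElapsedTime(&myMs, myStart, myStop);      // elapsed time in milliseconds
  printf("ADDCC[2^%u] -> time:\t%g\n", myExp, myMs);

  cudaEventDestroy(myStart);
  cudaEventDestroy(myStop);
}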

Here are the results of these performance tests, in case someone can shed some light on them. The columns below give the ADDCC timing for the non-concurrent baseline and for concurrent execution with 1, 2, 4, 8, and 16 kernels.

Size   Non-concurrent   1 kernel    2 kernels   4 kernels   8 kernels   16 kernels
2^10   0.0121729        0.0104474   0.0145335   0.0231104   0.0390721   0.0721062
2^11   0.0125819        0.0105187   0.0145678   0.0232158   0.0393887   0.0719587
2^12   0.0128622        0.0104961   0.0143913   0.0230844   0.039282    0.071891
2^13   0.0167381        0.0215177   0.0145683   0.0231635   0.0428878   0.0730289
2^14   0.0254805        0.0240342   0.0335424   0.0245262   0.0695331   0.0721654
2^15   0.0253569        0.0185507   0.016053    0.0242395   0.0433453   0.0731358
2^16   0.0264088        0.0205446   0.0212547   0.0246462   0.0431187   0.0736931
2^17   0.0401843        0.0298381   0.0294669   0.0320372   0.0422312   0.0755137
2^18   0.0706934        0.0570364   0.0566485   0.0589968   0.0631554   0.0731187
2^19   0.124931         0.111079    0.110735    0.113202    0.116901    0.125743
2^20   0.233733         0.219384    0.219054    0.221437    0.225481    0.23309
2^21   0.44601          0.435755    0.435459    0.437941    0.442002    0.449978
2^22   0.880258         0.86979     0.869478    0.872315    0.87626     0.884631

Things to notice (and to explain, if possible):

    non-concurrent execution and concurrent execution with a single-stream array behave differently;

    the smaller cases get progressively worse as the number of concurrent kernel invocations grows;

    the medium-sized cases (around 2^15) perform best with 1 or 2 kernels; more than that is harmful;

    the heaviest cases only get worse when executed concurrently.

The last point in particular surprises me. Isn't this further parallelization supposed to generally improve performance by keeping all the multiprocessors as busy as possible?
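In case it matters, I could also verify that the card actually reports concurrent kernel support with something like the following sketch (this check is not part of my current test):

// Query the device to confirm that concurrent kernel execution is reported as supported.
int myDev = 0;
cudaDeviceProp myProp;
cudaGetDevice(&myDev);
cudaGetDeviceProperties(&myProp, myDev);
printf("concurrentKernels = %d (compute capability %d.%d)\n",
       myProp.concurrentKernels, myProp.major, myProp.minor);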

Thanks all!

M.