Hi, I am trying to speed up my application through concurrent kernel execution. I am using a GTX 580, but when I increase the number of kernels it gets slower. Does anybody have an idea? Here is my awful code :-)
// Splits one reduction over N elements into `nkernels` independent launches,
// each issued on its own stream, then releases the streams and events.
//
// NOTE(review): both divisions below truncate. If N or blocks is not a
// multiple of nkernels, the trailing input elements are never reduced and the
// trailing entries of d_odata are never written -- confirm against the caller.
//
// NOTE(review): running several kernels concurrently can only help when a
// single launch leaves part of the GPU idle. If the original single launch
// already occupies every multiprocessor, splitting it presumably just adds
// per-launch and per-event overhead -- profile before keeping this scheme.
int nkernels = 8;
int tile = N/nkernels;          // input elements handled by each launch
int offset = blocks/nkernels;   // blocks per launch == partial results per launch
int nstreams = nkernels + 1;    // one stream per kernel plus one extra "join" stream
dim3 dimBlock(threads,1,1);
dim3 dimGrid(offset,1,1);

cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
for(int i = 0; i < nstreams; i++)
    cudaStreamCreate(&(streams[i]));

// Timing is disabled on the events because they are used only for ordering.
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t*) malloc(nkernels * sizeof(cudaEvent_t));
for(int i = 0; i < nkernels; i++)
    cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming);

for(int i = 0; i < nkernels; i++)
{
    // Reduce one tile of the 1D input into `offset` partial results;
    // the final reduction over all partial results happens on the CPU.
    reduceKernel <<< dimGrid, dimBlock, 0, streams[i] >>> (&d_idata[(i*tile)], &d_odata[offset*i], tile );
    cudaEventRecord(kernelEvent[i], streams[i]);
    // Make the last stream wait for kernel i.
    // NOTE(review): nothing visible here is ever enqueued on that stream, so
    // these waits appear to have no effect -- confirm whether they are needed.
    cudaStreamWaitEvent( streams[ nstreams-1 ], kernelEvent[i], 0);
}

// Release resources. Events exist only for the kernels, but streams must be
// destroyed for all `nstreams` entries: the previous loop stopped at nkernels
// and leaked the extra join stream.
for(int i = 0; i < nkernels; i++)
    cudaEventDestroy(kernelEvent[i]);
for(int i = 0; i < nstreams; i++)
    cudaStreamDestroy(streams[i]);
free(streams);
free(kernelEvent);
}
// Copy `blocks` partial results of type computetype from d_odata on the device
// into h_tmp on the host.
// NOTE(review): this presumably relies on the blocking cudaMemcpy on the legacy
// default stream to wait for the kernels launched on the other streams --
// confirm the build does not use per-thread default streams.
cudaMemcpy(h_tmp, d_odata, blocks*sizeof(computetype), cudaMemcpyDeviceToHost);
The result is fine, but the compute time is slower than with a single kernel launch.