CUDA synchronization takes a very long time

Hi,

I’m trying to convert a DirectX 12 RGBA float16 texture to BGRA float32 and flip it vertically. To do that, I:

  • I create a cuda stream
cuStreamCreate(&cuStream, CU_STREAM_NON_BLOCKING)
  • map the texture memory
cudaExternalMemoryBufferDesc buffDesc{};
memset(&buffDesc, 0, sizeof(buffDesc));
buffDesc.offset = 0;
buffDesc.size = d3d12Texture.memSize;

m_memorySize = d3d12Texture.memSize;
const cudaError_t cudaError = cudaExternalMemoryGetMappedBuffer(&m_cudaExternalMemoryPtr, m_externalMemory, &buffDesc);
  • I create a second memory with same pixel size but in float32
m_memoryFloat32Size = sizeof(float) * numElements;
		cudaMalloc(&m_cudaExternalFloat32MemoryPtr, m_memoryFloat32Size);
  • I call my cuda script
// Define block and grid dimensions
	const dim3 blockDim(m_blockXDim, m_blockYDim, 1);

	const int xPixelGrid = blockDim.x * m_iPixelPerThread;
	const int yPixelGrid = blockDim.y * m_iPixelPerThread;
	const int xRest = (m_width % xPixelGrid) > 0 ? 1 : 0;
	const int yRest = (m_height % yPixelGrid) > 0 ? 1 : 0;

	const dim3 gridDim(m_width / xPixelGrid + xRest,
		m_height / yPixelGrid + yRest,
		1);
	
	void* arr[] = { &m_cudaExternalMemoryPtr, &m_cudaExternalFloat32MemoryPtr,  &m_width, &m_height, static_cast<void*>(&m_iPixelPerThread) };

	/*process cuda kernel convert texture float16 to flipped texture in float32*/
	checkCudaErrors_ret(cuLaunchKernel(cuFunction, 
		gridDim.x, gridDim.y,gridDim.z,					/* grid dim */
		blockDim.x, blockDim.y, blockDim.z,				/* block dim */
		0, cuStream,									/* shared mem, stream */
		&arr[0],										/* arguments */
		0), false)

The CUDA kernel is:

// Number of interleaved channels per pixel (R, G, B, A).
const int RGBA_size = 4;

/*
* @brief Convert an interleaved RGBA float16 image to BGRA float32 while
*        flipping it vertically.
*
* Each thread processes a maxTreat x maxTreat tile of pixels; the launch
* grid must therefore cover ceil(width / (blockDim.x * maxTreat)) x
* ceil(height / (blockDim.y * maxTreat)) blocks. Tiles are clamped to the
* image border, so partial tiles at the right/bottom edge are handled.
*
* @param input    device pointer to width*height RGBA half pixels (row-major)
* @param output   device pointer to width*height BGRA float pixels (row-major)
* @param width    image width in pixels
* @param height   image height in pixels
* @param maxTreat tile edge length in pixels handled per thread
*/
extern "C" __global__ void convertFloat16ToFloat32Flip(const half* __restrict__ input, float* const __restrict__ output, const int width, const int height, const int maxTreat) {
    // Top-left corner of this thread's tile in pixel coordinates.
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) * maxTreat;
    const int y = (blockIdx.y * blockDim.y + threadIdx.y) * maxTreat;

    // Clamp the tile to the image border (the grid may overshoot).
    const int maxX = min(x + maxTreat, width);
    const int maxY = min(y + maxTreat, height);

    // Iterate rows in the OUTER loop: pixels of one row are contiguous in
    // row-major memory, so the inner loop walks consecutive addresses
    // instead of striding by width*RGBA_size floats per iteration (the
    // original X-outer/Y-inner order). Same iterations, better locality
    // and coalescing.
    for (int posY = y; posY < maxY; ++posY)
    {
        const int srcRowBase = posY * width;                  // source row start (pixels)
        const int dstRowBase = (height - 1 - posY) * width;   // vertical flip
        for (int posX = x; posX < maxX; ++posX)
        {
            const int position_src = (srcRowBase + posX) * RGBA_size;
            float* const dst = output + (dstRowBase + posX) * RGBA_size;
            // Widen half -> float and swap R and B (RGBA -> BGRA).
            dst[0] = __half2float(input[position_src + 2]);
            dst[1] = __half2float(input[position_src + 1]);
            dst[2] = __half2float(input[position_src]);
            dst[3] = __half2float(input[position_src + 3]);
        }
    }
}

  • Before getting the result I call synchronize
cudaStreamSynchronize(cuStream)

I do this for several textures but I keep the same stream.
I'm a noob with CUDA. It works, but synchronize takes a large amount of time.
Do you have an idea to optimize that ?
Thanks

synchronize takes a large amount of time because of the way CUDA works. A kernel launch is asynchronous. When you launch a kernel and then follow with a cudaStreamSynchronize(), from the CPU thread's perspective, all of the kernel's execution time shows up in the duration of the cudaStreamSynchronize() call.

There is nothing to optimize there. If you want to optimize, focus on the kernel code.