Hi,
I’m trying to convert a DirectX 12 RGBA float16 texture to BGRA float32 and flip it vertically. Here is what I do:
- I create a CUDA stream
// Create the stream on which the conversion kernel is later launched and synchronized.
// CU_STREAM_NON_BLOCKING: work in this stream does not implicitly synchronize with the
// legacy default (NULL) stream.
cuStreamCreate(&cuStream, CU_STREAM_NON_BLOCKING)
- I map the texture memory
// Describe the range of the imported external memory to map as a linear buffer:
// d3d12Texture.memSize bytes starting at offset 0.
cudaExternalMemoryBufferDesc buffDesc{};
// Zero the descriptor (it was already value-initialized by the {} above).
memset(&buffDesc, 0, sizeof(buffDesc));
buffDesc.offset = 0;
buffDesc.size = d3d12Texture.memSize;
// Remember the mapped size in bytes.
m_memorySize = d3d12Texture.memSize;
// Obtain a device pointer (m_cudaExternalMemoryPtr) to the mapped range of m_externalMemory.
// NOTE(review): the returned cudaError is not checked in this excerpt — confirm it is tested afterwards.
// NOTE(review): the kernel indexes this buffer as tightly packed, row-major RGBA half data;
// confirm the D3D12 resource really has that layout (no row pitch padding / tiling).
const cudaError_t cudaError = cudaExternalMemoryGetMappedBuffer(&m_cudaExternalMemoryPtr, m_externalMemory, &buffDesc);
- I allocate a second buffer with the same number of elements, but in float32
// Size in bytes of the float32 destination buffer: one float per element.
// NOTE(review): numElements is presumably width * height * 4 (RGBA) — confirm against its definition.
m_memoryFloat32Size = sizeof(float) * numElements;
// Allocate the destination buffer in device memory.
// NOTE(review): the cudaMalloc return code is ignored here; a failed allocation would only
// surface later as a kernel fault. Also make sure this runs once, not per frame.
cudaMalloc(&m_cudaExternalFloat32MemoryPtr, m_memoryFloat32Size);
- I launch my CUDA kernel
// Define block and grid dimensions.
// Threads per block come from m_blockXDim x m_blockYDim.
const dim3 blockDim(m_blockXDim, m_blockYDim, 1);
// Each thread handles m_iPixelPerThread pixels along each axis (the kernel's maxTreat),
// so one block covers xPixelGrid x yPixelGrid pixels.
const int xPixelGrid = blockDim.x * m_iPixelPerThread;
const int yPixelGrid = blockDim.y * m_iPixelPerThread;
// One extra block per axis when the image size is not a multiple of the block's pixel
// coverage (i.e. round the division below up).
const int xRest = (m_width % xPixelGrid) > 0 ? 1 : 0;
const int yRest = (m_height % yPixelGrid) > 0 ? 1 : 0;
const dim3 gridDim(m_width / xPixelGrid + xRest,
m_height / yPixelGrid + yRest,
1);
// Kernel arguments: one pointer per kernel parameter, in parameter order
// (input, output, width, height, maxTreat). cuLaunchKernel reads each value through its pointer.
// NOTE(review): the kernel declares width/height/maxTreat as int — confirm m_width, m_height
// and m_iPixelPerThread are 4-byte integers.
void* arr[] = { &m_cudaExternalMemoryPtr, &m_cudaExternalFloat32MemoryPtr, &m_width, &m_height, static_cast<void*>(&m_iPixelPerThread) };
/* Launch the kernel that converts the float16 texture to a vertically flipped float32 texture.
 * The launch is asynchronous: cuLaunchKernel only enqueues the work on cuStream, and the
 * kernel's execution time is paid by whoever synchronizes on that stream later. */
checkCudaErrors_ret(cuLaunchKernel(cuFunction,
gridDim.x, gridDim.y,gridDim.z, /* grid dim */
blockDim.x, blockDim.y, blockDim.z, /* block dim */
0, cuStream, /* shared mem, stream */
&arr[0], /* arguments */
0), false)
The CUDA kernel is:
// Number of channels per pixel (R, G, B, A).
const int RGBA_size = 4;
/*
 * @brief Convert an RGBA float16 image to BGRA float32 and flip it vertically.
 *
 * Source row y is written to destination row (height - 1 - y); within each pixel
 * the red and blue channels are swapped, green and alpha keep their position.
 *
 * Launch layout: 2D blocks on a 2D grid. Each block converts a tile of
 * (blockDim.x * maxTreat) x (blockDim.y * maxTreat) pixels, so the grid needs
 * ceil(width / (blockDim.x * maxTreat)) x ceil(height / (blockDim.y * maxTreat))
 * blocks to cover the image. No shared memory is used.
 *
 * Inside a tile the threads advance in steps of blockDim.x / blockDim.y, so threads
 * with consecutive threadIdx.x always touch consecutive pixels of the same row and
 * the global loads/stores of a warp coalesce. (Giving every thread its own
 * contiguous maxTreat x maxTreat patch makes neighbouring threads access addresses
 * maxTreat pixels apart, which wastes memory bandwidth.)
 *
 * @param input    tightly packed, row-major image of width * height pixels, 4 half values per pixel (R, G, B, A)
 * @param output   tightly packed, row-major image of width * height pixels, 4 float values per pixel (B, G, R, A);
 *                 must not overlap input
 * @param width    image width in pixels
 * @param height   image height in pixels
 * @param maxTreat number of pixels each thread handles along each axis; values <= 0 make the kernel a no-op
 */
extern "C" __global__ void convertFloat16ToFloat32Flip(const half * __restrict__ input, float* __restrict__ const output, const int width, const int height, const int maxTreat) {
    // Extent of the tile owned by one block, in pixels.
    const int tileWidth = static_cast<int>(blockDim.x) * maxTreat;
    const int tileHeight = static_cast<int>(blockDim.y) * maxTreat;

    // Top-left pixel of this block's tile and its exclusive end, clamped to the image.
    const int tileX = static_cast<int>(blockIdx.x) * tileWidth;
    const int tileY = static_cast<int>(blockIdx.y) * tileHeight;
    const int endX = min(tileX + tileWidth, width);
    const int endY = min(tileY + tileHeight, height);

    const int strideX = static_cast<int>(blockDim.x);
    const int strideY = static_cast<int>(blockDim.y);

    for (int posY = tileY + static_cast<int>(threadIdx.y); posY < endY; posY += strideY)
    {
        // Row offsets in pixels, computed in size_t so large images cannot overflow int.
        const size_t srcRow = static_cast<size_t>(posY) * width;
        const size_t dstRow = static_cast<size_t>(height - 1 - posY) * width; // vertical flip

        for (int posX = tileX + static_cast<int>(threadIdx.x); posX < endX; posX += strideX)
        {
            const half* src = input + (srcRow + posX) * RGBA_size;
            float* dst = output + (dstRow + posX) * RGBA_size;

            // Read the whole pixel first, then write it back as B, G, R, A.
            const float red = __half2float(src[0]);
            const float green = __half2float(src[1]);
            const float blue = __half2float(src[2]);
            const float alpha = __half2float(src[3]);

            dst[0] = blue;
            dst[1] = green;
            dst[2] = red;
            dst[3] = alpha;
        }
    }
}
- Before reading the result, I call synchronize
// Block the host until all work previously queued on cuStream has finished.
// Because the kernel launch is asynchronous, the time spent in this call includes
// the kernel's execution time, not just synchronization overhead.
cudaStreamSynchronize(cuStream)
I do this for several textures, but I reuse the same stream.
I'm a beginner with CUDA. It works, but the synchronize call takes a large amount of time.
Do you have any idea how to optimize that?
Thanks