Based on the Parallel Forall blog post http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/ about an efficient matrix transpose, I set up my own copy kernel to use as a baseline when measuring the performance of other kernels.
The first two kernels I measured convert between single-channel byte (unsigned char) and float images. I was surprised to see that the float-to-byte kernel, Convert_32f_8u, reaches around 70% of the copy kernel's bandwidth, whereas the byte-to-float kernel, Convert_8u_32f, is more like 40%. Can anyone spot anything obvious in the kernels below that would explain why? The operations involved are nearly identical. (A rough sketch of how I launch and time the kernels follows the listings.)
#define TILE_DIM 32
#define BLOCK_ROWS 4
struct CUDAImageBase
{
    UINT  nWidth;   // Image width in pixels
    UINT  nHeight;  // Image height in pixels
    UINT  nPitch;   // Row pitch in bytes
    void* pData;    // Device pointer to the image data
};
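For completeness, the images live in pitched device memory; the helper below is only a sketch of how the fields get filled in (AllocImage and the element-size parameter are illustrative, not my actual wrapper code):
// Illustrative allocation sketch (error checking omitted).
// nElementSize would be 1 for 8u images and 4 (sizeof(float)) for 32f images.
CUDAImageBase AllocImage(UINT nWidth, UINT nHeight, UINT nElementSize)
{
    CUDAImageBase img;
    img.nWidth  = nWidth;
    img.nHeight = nHeight;
    size_t pitch = 0;
    // cudaMallocPitch pads each row for alignment, so nPitch is the
    // row stride in bytes, not in elements.
    cudaMallocPitch(&img.pData, &pitch, (size_t)nWidth * nElementSize, nHeight);
    img.nPitch = (UINT)pitch;
    return img;
}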
///////////////////////////////////////////////////////////////////////////////////////////////
__global__ void CopyTest(const CUDAImageBase srcImage, CUDAImageBase destImage)
{
    // Work out the location of the pixel being accessed by this thread
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    // Ensure x is within image bounds
    if (x < srcImage.nWidth)
    {
        const float* pSrcData = (float*)((unsigned char*)srcImage.pData + y * srcImage.nPitch) + x;
        float* pDestData = (float*)((unsigned char*)destImage.pData + y * destImage.nPitch) + x;
        int nSrcOffset  = srcImage.nPitch  * (BLOCK_ROWS / sizeof(float));
        int nDestOffset = destImage.nPitch * (BLOCK_ROWS / sizeof(float));
        // Loop for each row
        int iMax = min((int)TILE_DIM, (int)srcImage.nHeight - y);
        for (int i = 0; i < iMax; i += BLOCK_ROWS)
        {
            // Copy data
            pDestData[0] = pSrcData[0];
            // Move on to next row
            pSrcData  += nSrcOffset;
            pDestData += nDestOffset;
        }
    }
} // CopyTest
///////////////////////////////////////////////////////////////////////////////////////////////
__global__ void Convert_8u_32f(const CUDAImageBase srcImage,
                               CUDAImageBase destImage,
                               const float fScale,
                               const float fOffset)
{
    // Work out the location of the pixel being accessed by this thread
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    // Ensure x is within image bounds
    if (x < srcImage.nWidth)
    {
        const unsigned char* pSrcData = ((unsigned char*)srcImage.pData + y * srcImage.nPitch) + x;
        float* pDestData = (float*)((unsigned char*)destImage.pData + y * destImage.nPitch) + x;
        int nSrcOffset  = srcImage.nPitch * BLOCK_ROWS;
        int nDestOffset = destImage.nPitch * (BLOCK_ROWS / sizeof(float));
        // Loop for each row
        int iMax = min((int)TILE_DIM, (int)srcImage.nHeight - y);
        for (int i = 0; i < iMax; i += BLOCK_ROWS)
        {
            // Convert data
            pDestData[0] = __uint2float_rn(pSrcData[0]) * fScale + fOffset;
            // Move on to next row
            pSrcData  += nSrcOffset;
            pDestData += nDestOffset;
        }
    }
} // Convert_8u_32f
///////////////////////////////////////////////////////////////////////////////////////////////
__global__ void Convert_32f_8u(const CUDAImageBase srcImage,
                               CUDAImageBase destImage,
                               const float fScale,
                               const float fOffset)
{
    // Work out the location of the pixel being accessed by this thread
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    // Ensure x is within image bounds
    if (x < srcImage.nWidth)
    {
        const float* pSrcData = (float*)((unsigned char*)srcImage.pData + y * srcImage.nPitch) + x;
        unsigned char* pDestData = ((unsigned char*)destImage.pData + y * destImage.nPitch) + x;
        int nSrcOffset  = srcImage.nPitch * (BLOCK_ROWS / sizeof(float));
        int nDestOffset = destImage.nPitch * BLOCK_ROWS;
        // Loop for each row
        int iMax = min((int)TILE_DIM, (int)srcImage.nHeight - y);
        for (int i = 0; i < iMax; i += BLOCK_ROWS)
        {
            // Convert data, clamping to the byte range
            pDestData[0] = min(255, __float2uint_rn(pSrcData[0] * fScale + fOffset));
            // Move on to next row
            pSrcData  += nSrcOffset;
            pDestData += nDestOffset;
        }
    }
} // Convert_32f_8u
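In case it helps, here is roughly how I launch and time the kernels. The block shape follows the blog post (TILE_DIM x BLOCK_ROWS threads, each thread handling TILE_DIM / BLOCK_ROWS rows of a tile). The image names (src8u, dest32f), scale/offset values, and repetition count below are just placeholders rather than my actual test values:
// Illustrative launch and timing harness (not the exact benchmark code).
dim3 block(TILE_DIM, BLOCK_ROWS);
dim3 grid((src8u.nWidth  + TILE_DIM - 1) / TILE_DIM,
          (src8u.nHeight + TILE_DIM - 1) / TILE_DIM);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

const int NUM_REPS = 100; // placeholder repetition count
cudaEventRecord(start);
for (int i = 0; i < NUM_REPS; ++i)
    Convert_8u_32f<<<grid, block>>>(src8u, dest32f, 1.0f / 255.0f, 0.0f);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);

// Effective bandwidth: bytes read plus bytes written per call, over the average time.
double bytes    = (double)src8u.nWidth * src8u.nHeight * (sizeof(unsigned char) + sizeof(float));
double gbPerSec = bytes * NUM_REPS / (ms * 1.0e6);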
I'm testing on SM 3.0 cards, so my understanding is that alignment shouldn't be a big issue. All the kernels are fast (and appear to function correctly), but the discrepancy in speeds is confusing me, so any suggestions would be appreciated.
Thanks,
Ewan