Hey all,
Just getting to grips with shared memory & using it effectively, my program is counting “warp serialize” values from the profiler tool & I’m not sure why. I’m using typical shared memory accesses like from the programming guide, and the model (array sizes & access patterns) seem to work when isolated into a test kernel without serializing. So as far as I can tell it’s a good model.
Excuse the over-use of #defines, thats both so i can swap between shared memory/register memory easy & so I can guarantee a single array-indexing pattern.
Block sizes are 16x16, grid size is 64x48.
I think it might be the shared dx/dy or srcx/srcy (and using them to index into the texture), but I just don’t know how that could be true if I’m always indexing into the array in the same way (which sometimes works).
The serialize value is very low, but enough to make the shared memory worthless, it’s actually quicker if all values are in global/register mem & the bilerp happens there. Maybe I’ve got the wrong concept for shared mem (again) & I’m using it in the wrong way, please tell me so! :)
__global__ void lens_correct_kernel ( unsigned char* imgSrc, uint imgSrcW, uint imgSrcH, uint imgSrcP,
unsigned char* imgDst, uint imgDstW, uint imgDstH, uint imgDstP, float* mapX, float* mapY )
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ float _dx[LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
__shared__ float _dy[LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
#define dx _dx[ threadIdx.y ][ threadIdx.x ]
#define dy _dy[ threadIdx.y ][ threadIdx.x ]
dx = tex2D ( mapXTex, x, y );
dy = tex2D ( mapYTex, x, y );
__shared__ int2 _srcxy[LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
_srcxy[ threadIdx.y ][ threadIdx.x ] = make_int2 ( (int)dx, (int)dy );
#define SRCX _srcxy[ threadIdx.y ][ threadIdx.x ].x
#define SRCY _srcxy[ threadIdx.y ][ threadIdx.x ].y
dx = dx - (float)SRCX;
dy = dy - (float)SRCY;
#define lookup(X,Y,P) ( (Y) * P ) + ( (X) * 4 )
// bounds check for bilerp
if (( (SRCX + 1) < imgSrcW ) && ( (SRCY + 1) < imgSrcH ))
{
__shared__ uchar4 _x0y0 [LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
__shared__ uchar4 _x0y1 [LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
__shared__ uchar4 _x1y0 [LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
__shared__ uchar4 _x1y1 [LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
__shared__ uchar4 _result [LENS_BLOCK_DIM_X][LENS_BLOCK_DIM_Y+1];
#define x0y0 _x0y0 [ threadIdx.y ][ threadIdx.x ]
#define x0y1 _x0y1 [ threadIdx.y ][ threadIdx.x ]
#define x1y0 _x1y0 [ threadIdx.y ][ threadIdx.x ]
#define x1y1 _x1y1 [ threadIdx.y ][ threadIdx.x ]
#define result _result [ threadIdx.y ][ threadIdx.x ]
x0y0 = tex2D ( rgbImageTex, SRCX, SRCY );
x0y1 = tex2D ( rgbImageTex, SRCX, SRCY+1 );
x1y1 = tex2D ( rgbImageTex, SRCX+1, SRCY+1 );
x1y0 = tex2D ( rgbImageTex, SRCX+1, SRCY );
result = bilerp_c3 ( x0y0, x0y1, x1y1, x1y0, dx, dy );
*(uchar4*)(imgDst + lookup(x, y, imgDstP)) = result;
}
else
{
// just copy the boundary pixels
*(uchar4*)(imgDst + lookup(x, y, imgDstP)) = tex2D ( rgbImageTex, SRCX, SRCY );
}
#undef x0y0
#undef x0y1
#undef x1y1
#undef x1y0
#undef lookup
#undef SRCX
#undef SRCY
#undef dx
#undef dy
}
inline __device__ uchar4 bilerp_c3 ( uchar4 x1y1, uchar4 x1y2, uchar4 x2y2, uchar4 x2y1, float dx, float dy )
{
#define OMDX (1.0f - dx)
#define OMDY (1.0f - dy)
return make_uchar4 ((unsigned char)(( OMDX * OMDY * (float)x1y1.x) + (OMDX * dy * (float)x1y2.x) + (dx * OMDY * (float)x2y1.x) + (dx * dy * (float)x2y2.x )),
(unsigned char)(( OMDX * OMDY * (float)x1y1.y) + (OMDX * dy * (float)x1y2.y) + (dx * OMDY * (float)x2y1.y) + (dx * dy * (float)x2y2.y )),
(unsigned char)(( OMDX * OMDY * (float)x1y1.z) + (OMDX * dy * (float)x1y2.z) + (dx * OMDY * (float)x2y1.z) + (dx * dy * (float)x2y2.z )),
0 );
#undef OMDX
#undef OMDY
}