Hi All,
I am reimplementing a kernel that previously took 2 ms. I now want to use shared memory inside this kernel to improve performance further.
But after the shared-memory implementation it takes 4.3 ms. When I debugged it, I found that loading the data into shared memory alone takes 2.2 ms.
To check this, I separated out the part of the kernel that loads data into shared memory. Here is that part as a standalone kernel:
// Stages a 5x5 neighbourhood of texels per thread into shared memory.
//
// Each thread computes its (column, row) position from the block and thread
// indices, turns it into a linear offset into TextImgBuff using dstW as the
// row pitch, and fetches the 25 texels that start at that offset and extend
// 5 columns to the right and 5 rows down. The values are stored row-major in
// the thread's own row of shGray (entries 0..24).
//
// Parameters:
//   dstW - row pitch, in elements, used to step from one image row to the next.
//   dstH - not referenced anywhere in this kernel.
//
// Notes:
//   * The per-thread slot is (threadIdx.y << 4) + threadIdx.x, i.e. the shift
//     hard-codes 16 slots per threadIdx.y value.
//     NOTE(review): presumably the launch uses blockDim.x == 16 and at most
//     192 threads per block so that the slot stays inside shGray - confirm at
//     the launch site.
//   * shGray has 26 columns but only columns 0..24 are written here.
//     NOTE(review): the extra column looks like padding - confirm intent.
//   * This excerpt performs no bounds check on the computed position, issues
//     no __syncthreads(), and never reads shGray back.
__global__
void getImgValue( long dstW, long dstH )
{
    // Extent of the per-thread neighbourhood.
    const int kPatchRows = 5;
    const int kPatchCols = 5;

    __shared__ unsigned short int shGray[192][26];

    // Position of this thread in the launch grid (24-bit multiply intrinsic).
    const int col = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    const int row = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // Linear offset of the neighbourhood's top-left texel.
    const long topLeft = (row * dstW + col);

    // Row of shGray owned by this thread.
    const int slot = (threadIdx.y << 4) + threadIdx.x;

    // Fixed trip counts: ask for full unrolling so the generated code matches
    // 25 straight-line fetches.
    #pragma unroll
    for (int r = 0; r < kPatchRows; ++r)
    {
        // Offset of the first texel in neighbourhood row r.
        const long rowStart = topLeft + r * dstW;

        #pragma unroll
        for (int c = 0; c < kPatchCols; ++c)
        {
            shGray[slot][r * kPatchCols + c] = tex1Dfetch(TextImgBuff, rowStart + c);
        }
    }
}
Why is this kernel taking so much time? Can anyone point out the cause?