Hi All,

In my kernel

```
// Expands every element of an xValue-by-yValue grid into 256 output bytes
// fetched from the texture reference hTexture.
//
// Launch layout: one thread per element, using only the x dimension of the
// grid and block (idx = blockIdx.x * blockDim.x + threadIdx.x). Threads whose
// idx is >= xValue * yValue do nothing, so the grid may be rounded up.
//
// Parameters:
//   Value  - output buffer; thread idx writes Value[idx*256 .. idx*256 + 255],
//            so it must hold at least xValue * yValue * 256 bytes.
//   xValue - row length used to derive the row number y = idx / xValue.
//   yValue - number of rows; only used for the bounds check.
//
// Reads the texture references hDY and hTexture, which are declared outside
// this block.
//
// NOTE(review): each thread stores its 256 bytes one byte at a time, and
// neighbouring threads write to addresses 256 bytes apart. If Value points to
// global memory, a warp's stores cannot be combined into wide transactions;
// this is the most likely reason for the long run time - confirm with a
// profiler. Fixing it means changing which thread writes which bytes.
__global__
void getValue( unsigned char *Value, long xValue, long yValue )
{
    // Plain 64-bit arithmetic replaces __mul24 here: __mul24 only uses the low
    // 24 bits of each operand, so it silently produced wrong indices once the
    // sizes or the thread index grew past 2^24.
    const long long limit = static_cast<long long>( xValue ) * yValue; // e.g. 2000 * 1500
    const long long idx   = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    if ( idx >= limit )
    {
        return; // padding thread of the last block
    }

    const int y  = static_cast<int>( idx / xValue ); // row of this element
    const int y_ = y * 8;                            // first of the 8 hDY entries for this row

    // Low 32 bits of y * idx, matching what __mul24( y, idx ) returned for
    // in-range operands. Done in unsigned arithmetic so the wrap is well defined.
    // NOTE(review): for 2000 x 1500 this product exceeds 31 bits and wraps;
    // confirm that y * idx (rather than e.g. y * xValue) is really intended.
    const int dt = static_cast<int>( static_cast<unsigned int>( y ) *
                                     static_cast<unsigned int>( idx ) );

    // 64-bit byte offset: idx * 256 overflows a 32-bit int once idx >= 2^23.
    const long long outBase = idx * 256;

    for ( int i = 0; i < 4; i++ )
    {
        for ( int l = 0; l < 8; l++ )
        {
            // Operand order is kept as (fetch + dt) + i because the element
            // type of hDY is not visible from here.
            const int dl = tex1Dfetch( hDY, y_ + l ) + dt + i;
            const long long ind = outBase + i * 64 + l * 8;

            // Eight consecutive output bytes, read from texels 4 apart.
            #pragma unroll
            for ( int k = 0; k < 8; k++ )
            {
                Value[ind + k] = tex1Dfetch( hTexture, (k << 2) + dl );
            }
        }
    }
}
```

This is called using 256 threads per block.

My problem is:

This function's execution time is **93 ms**. I do not understand why it takes so much time.