Hi, how can I make each thread read as large a chunk as possible from device memory into shared memory? The reads are not coalesced across threads. Thanks!
You can create a union whose size is 16 bytes and read it in a single access:
// 16-byte chunk holding eight shorts.
// The __align__(16) specifier forces 16-byte alignment so that a whole `un`
// can be copied as one aligned 16-byte unit instead of eight separate
// 2-byte accesses.
union __align__(16) un
{
    short sh[8];  // the eight 16-bit elements packed into the chunk
};
// Shared-memory tile of 64 shorts (128 bytes, i.e. eight 16-byte `un` chunks).
// kernel() accesses this array through a `un *`, which requires 16-byte
// alignment; a plain short array only guarantees 2-byte alignment, so the
// alignment is requested explicitly.
__shared__ __align__(16) short block[64];
// A single shared `un` (one instance per thread block, visible to all of its
// threads).
__shared__ un uData;
// Copies one tile of shorts from global memory into the shared array `block`,
// with each thread moving one 16-byte row (eight shorts) as a single `un`.
//
// data  : global-memory buffer of shorts. It is reinterpreted as an array of
//         `un`, so it must be 16-byte aligned.
// width : row length of `data`, in shorts. It must be a multiple of 8;
//         otherwise the conversion from a short index to a `un` index below
//         truncates and the thread reads the wrong chunk.
//
// blockIdx.x / blockIdx.y select the tile; threadIdx.x selects the row inside
// the tile and the destination slot inside `block`.
// NOTE(review): the `y * 8` row stride suggests the kernel is meant to be
// launched with 8 threads per block - confirm against the launch site.
__global__ void kernel(short *data, int width)
{
    extern __shared__ short block[];

    const int x = blockIdx.x;
    const int y = blockIdx.y;
    const int tid = threadIdx.x;

    // Position of this thread's first short inside `data`. Computed in size_t
    // so that large images do not overflow 32-bit int arithmetic.
    const size_t firstShort =
        (static_cast<size_t>(y) * 8 + tid) * static_cast<size_t>(width) +
        static_cast<size_t>(x) * 8;

    // Convert from an index in shorts to an index in 8-short `un` chunks.
    const size_t offs = firstShort >> 3;

    const un *puIn = reinterpret_cast<const un *>(data) + offs;
    un *puOut = reinterpret_cast<un *>(block) + tid;

    // Stage the chunk in a per-thread local. Staging through one shared
    // variable would let every thread of the block overwrite the same
    // location, so a thread could store another thread's row.
    const un chunk = *puIn;
    *puOut = chunk;

    // Make every row visible to the whole block before any code added after
    // this point reads rows written by other threads.
    __syncthreads();
}
Thank you very much!
I was thinking of using a texture…