/**
 * Adds a per-thread offset to one int16_t element per thread, staging the
 * value through a block-local shared array.
 *
 * Each thread loads its element of `addr` into `cache` at the linear slot
 * (threadIdx.y * 16 + threadIdx.x), adds `c + (ox + oy)` to it there, and
 * stores the result back to the same global location. `ox`/`oy` are the
 * bit-shuffled thread coordinates and only contribute to the added value here.
 *
 * addr    - device buffer, updated in place.
 * c       - constant added to every element.
 * stride  - multiplier applied to the row index when addressing `addr`
 *           (presumably the row pitch in elements - confirm with caller).
 * stride2 - multiplier applied to blockIdx.x / blockIdx.y (presumably the
 *           tile edge, 16 - confirm with caller).
 *
 * The shared array holds 16*16 elements and is indexed with threadIdx
 * directly, so threadIdx.x and threadIdx.y must both stay below 16.
 * No bounds check is performed on `addr`; the launch must cover only valid
 * elements. No barrier is needed: a thread only touches its own shared slot.
 */
__global__ void CUDA_set_memory_plane16 (int16_t *addr,int c,uint32_t stride,uint32_t stride2)
{
    __shared__ int16_t cache[16*16];

    const unsigned int col = threadIdx.x;
    const unsigned int row = threadIdx.y;

    // Shuffled coordinates: low three bits of one axis doubled, plus bit 3 of the other axis.
    const unsigned int ox = ((col << 1) & 15) + ((row >> 3) & 1);
    const unsigned int oy = ((row << 1) & 15) + ((col >> 3) & 1);

    // Linear shared slot and global element owned by this thread.
    const unsigned int slot = row * 16 + col;
    const unsigned int gidx = (blockIdx.y * stride2 + row) * stride + blockIdx.x * stride2 + col;

    // Kept as an unsigned sum so the wrap-around behaviour matches `c+(ox+oy)`.
    const unsigned int delta = c + (ox + oy);

    cache[slot] = addr[gidx];
    cache[slot] += delta;
    addr[gidx] = cache[slot];
}
/**
 * Variant of the plane update that stages each element in a permuted
 * shared-memory slot instead of the linear one.
 *
 * Each thread loads its element of `addr` into `cache` at slot (oy * 16 + ox),
 * adds `c + (threadIdx.x + threadIdx.y)` to it there, and stores the result
 * back to the same global location. (ox, oy) is built from the low three bits
 * of one thread coordinate and bit 3 of the other, which is a one-to-one
 * remapping of the 16x16 tile, so every thread still owns a distinct slot and
 * no barrier is required.
 *
 * addr    - device buffer, updated in place.
 * c       - constant added to every element.
 * stride  - multiplier applied to the row index when addressing `addr`
 *           (presumably the row pitch in elements - confirm with caller).
 * stride2 - multiplier applied to blockIdx.x / blockIdx.y (presumably the
 *           tile edge, 16 - confirm with caller).
 *
 * threadIdx.x and threadIdx.y must both stay below 16 for the slot mapping to
 * remain one-to-one. No bounds check is performed on `addr`.
 */
__global__ void CUDA_set_memory_plane16_opt (int16_t *addr,int c,uint32_t stride,uint32_t stride2)
{
    __shared__ int16_t cache[16*16];

    const unsigned int col = threadIdx.x;
    const unsigned int row = threadIdx.y;

    // Permuted shared-memory coordinates for this thread.
    const unsigned int ox = ((col << 1) & 15) + ((row >> 3) & 1);
    const unsigned int oy = ((row << 1) & 15) + ((col >> 3) & 1);
    const unsigned int slot = oy * 16 + ox;

    // Global element owned by this thread (unchanged, linear in threadIdx).
    const unsigned int gidx = (blockIdx.y * stride2 + row) * stride + blockIdx.x * stride2 + col;

    // Kept as an unsigned sum so the wrap-around behaviour matches `c+(tx+ty)`.
    const unsigned int delta = c + (col + row);

    cache[slot] = addr[gidx];
    cache[slot] += delta;
    addr[gidx] = cache[slot];
}
The opt version shows no warp serialization, but its time increases in the Visual Profiler.
main file:
// Benchmark driver fragment: runs each kernel 200 times on the same buffer.
// `data16` is declared outside this snippet.
//
// Allocate 16*4*16*32 = 32768 int16_t elements. With the launch geometry
// below (stride 64, 4x32 blocks of 16x16 threads) the kernels address exactly
// indices 0..32767, i.e. a 64-wide by 512-high plane.
// NOTE(review): the return code is not checked, and the buffer is not
// initialized in this snippet before the kernels read it.
cudaMalloc((void**) &data16, sizeof(int16_t)*16*4*16*32);
// Despite its name, `block` is passed as the GRID dimensions (4 x 32 blocks).
dim3 block(4,32);
// Despite its name, `thread` is passed as the BLOCK dimensions (16 x 16 threads).
dim3 thread(16,16);
int i;
// 200 launches of the linear-slot kernel: c = 20, stride = 64, stride2 = 16.
// NOTE(review): launches are asynchronous and no error check
// (cudaGetLastError) or synchronization follows them here.
for(i=0;i<200;i++)
CUDA_set_memory_plane16<<<block,thread>>>(data16,20,16*4,16);
// 200 launches of the permuted-slot kernel with the same arguments; it runs
// on the data already modified by the first loop.
for(i=0;i<200;i++)
CUDA_set_memory_plane16_opt<<<block,thread>>>(data16,20,16*4,16);
// Release the device buffer; return code is not checked.
cudaFree(data16);
The opt version's average time is about 11.8,
while the normal version's is about 10.4.
Why?