Resolved warp serialization, but execution time increased

/*
 * Baseline kernel: every thread adds (c + ox + oy) to one int16_t element of
 * the plane, staging the value through a block-local __shared__ buffer.
 *
 * addr    - plane of int16_t elements, updated in place
 * c       - constant added to every element
 * stride  - distance, in elements, between consecutive rows of the plane
 * stride2 - tile step: block (bx,by) starts at row by*stride2, column bx*stride2
 *
 * The shared buffer has 16*16 entries and is indexed as threadIdx.y*16 +
 * threadIdx.x, so the indexing fits a block of at most 16x16 threads.
 * Each thread only ever touches its own shared slot, hence no barrier is used.
 */
__global__ void CUDA_set_memory_plane16 (int16_t *addr,int c,uint32_t stride,uint32_t stride2)
{
	__shared__ int16_t cache[16*16];

	const unsigned int col = threadIdx.x;
	const unsigned int row = threadIdx.y;

	// Element of the plane owned by this thread.
	const unsigned int gidx = (blockIdx.y*stride2 + row)*stride + blockIdx.x*stride2 + col;

	// Shared slot in natural row-major order.
	const unsigned int slot = row*16 + col;

	// Swizzled coordinates; in this kernel they only contribute to the added value.
	const unsigned int swzX = ((col<<1)&15) + ((row>>3)&1);
	const unsigned int swzY = ((row<<1)&15) + ((col>>3)&1);

	cache[slot] = addr[gidx];
	cache[slot] += c + (swzX + swzY);
	addr[gidx] = cache[slot];
}

/*
 * Variant of CUDA_set_memory_plane16 that stages each element in a swizzled
 * shared-memory slot (oy*16 + ox) instead of the natural ty*16 + tx slot, and
 * adds (c + tx + ty) to the element.
 *
 * addr    - plane of int16_t elements, updated in place
 * c       - constant added to every element
 * stride  - distance, in elements, between consecutive rows of the plane
 * stride2 - tile step: block (bx,by) starts at row by*stride2, column bx*stride2
 *
 * For threadIdx.x / threadIdx.y in 0..15 the swizzle maps every thread to a
 * distinct slot of the 16*16 buffer: ox carries the low three bits of tx plus
 * bit 3 of ty, oy carries the low three bits of ty plus bit 3 of tx. Each
 * thread therefore reads back only what it wrote and no barrier is used.
 */
__global__ void CUDA_set_memory_plane16_opt (int16_t *addr,int c,uint32_t stride,uint32_t stride2)
{
	__shared__ int16_t cache[16*16];

	const unsigned int col = threadIdx.x;
	const unsigned int row = threadIdx.y;

	// Element of the plane owned by this thread.
	const unsigned int gidx = (blockIdx.y*stride2 + row)*stride + blockIdx.x*stride2 + col;

	// Swizzled shared-memory coordinates.
	const unsigned int swzX = ((col<<1)&15) + ((row>>3)&1);
	const unsigned int swzY = ((row<<1)&15) + ((col>>3)&1);
	const unsigned int slot = swzY*16 + swzX;

	cache[slot] = addr[gidx];
	cache[slot] += c + (col + row);
	addr[gidx] = cache[slot];
}

The opt version shows no warp serialization, but its execution time increases in the Visual Profiler.

main file:

// Device buffer for the plane: 16*4 elements per row, 16*32 rows of int16_t.
cudaMalloc((void**) &data16, sizeof(int16_t)*16*4*16*32);

	// Launch configuration: 4x32 blocks, each 16x16 threads.
	dim3 block(4,32);
	dim3 thread(16,16);

	int i;

	// 200 launches of the baseline kernel (row stride 16*4, tile step 16, c = 20).
	for(i=0;i<200;i++)
	{
		CUDA_set_memory_plane16<<<block,thread>>>(data16,20,16*4,16);
	}

	// 200 launches of the swizzled-shared-memory variant with the same arguments.
	for(i=0;i<200;i++)
	{
		CUDA_set_memory_plane16_opt<<<block,thread>>>(data16,20,16*4,16);
	}

	cudaFree(data16);

The opt version's average time is about 11.8.

The normal version's average time is about 10.4.

Why?

/*
 * Baseline kernel: each thread loads one int16_t element of the plane into a
 * block-local __shared__ buffer, adds (c + ox + oy) to it there, and stores
 * the result back to the same element.
 *
 * addr    - plane of int16_t elements, updated in place
 * c       - constant added to every element
 * stride  - distance, in elements, between consecutive rows of the plane
 * stride2 - tile step: block (bx,by) starts at row by*stride2, column bx*stride2
 *
 * The shared buffer has 16*16 entries indexed by threadIdx.y*16 + threadIdx.x,
 * which fits a block of at most 16x16 threads. A thread never reads another
 * thread's slot, so the kernel contains no barrier.
 */
__global__ void CUDA_set_memory_plane16 (int16_t *addr,int c,uint32_t stride,uint32_t stride2)
{
	__shared__ int16_t cache[16*16];

	const unsigned int lx = threadIdx.x;
	const unsigned int ly = threadIdx.y;

	// Swizzled coordinates; here they only feed the value that is added.
	const unsigned int ox = ((lx<<1)&15) + ((ly>>3)&1);
	const unsigned int oy = ((ly<<1)&15) + ((lx>>3)&1);

	// This thread's element in global memory and its natural-order shared slot.
	int16_t *element = addr + ((blockIdx.y*stride2 + ly)*stride + blockIdx.x*stride2 + lx);
	int16_t *staged  = cache + (ly*16 + lx);

	*staged = *element;
	*staged += c + (ox + oy);
	*element = *staged;
}

/*
 * Swizzled variant: each thread loads one int16_t element of the plane into
 * shared slot oy*16 + ox (rather than ty*16 + tx), adds (c + tx + ty) to it
 * there, and stores the result back to the same element.
 *
 * addr    - plane of int16_t elements, updated in place
 * c       - constant added to every element
 * stride  - distance, in elements, between consecutive rows of the plane
 * stride2 - tile step: block (bx,by) starts at row by*stride2, column bx*stride2
 *
 * With threadIdx.x / threadIdx.y in 0..15, (ox, oy) is a one-to-one remapping
 * of (tx, ty) inside the 16*16 buffer, so slots stay private to their thread
 * and the kernel contains no barrier.
 */
__global__ void CUDA_set_memory_plane16_opt (int16_t *addr,int c,uint32_t stride,uint32_t stride2)
{
	__shared__ int16_t cache[16*16];

	const unsigned int lx = threadIdx.x;
	const unsigned int ly = threadIdx.y;

	// Swizzled shared-memory coordinates.
	const unsigned int ox = ((lx<<1)&15) + ((ly>>3)&1);
	const unsigned int oy = ((ly<<1)&15) + ((lx>>3)&1);

	// This thread's element in global memory and its swizzled shared slot.
	int16_t *element = addr + ((blockIdx.y*stride2 + ly)*stride + blockIdx.x*stride2 + lx);
	int16_t *staged  = cache + (oy*16 + ox);

	*staged = *element;
	*staged += c + (lx + ly);
	*element = *staged;
}

The opt version shows no warp serialization, but its execution time increases in the Visual Profiler.

main file:

// Allocate the int16_t plane on the device: (16*4) * (16*32) elements.
cudaMalloc((void**) &data16, sizeof(int16_t)*16*4*16*32);

	dim3 block(4,32);     // grid: 4 blocks in x, 32 in y
	dim3 thread(16,16);   // block: 16x16 threads

	int i;

	// Baseline kernel, launched 200 times back to back.
	for(i=0;i<200;i++) {
		CUDA_set_memory_plane16<<<block,thread>>>(data16,20,16*4,16);
	}

	// Swizzled variant, launched 200 times with identical arguments.
	for(i=0;i<200;i++) {
		CUDA_set_memory_plane16_opt<<<block,thread>>>(data16,20,16*4,16);
	}

	cudaFree(data16);

The opt version's average time is about 11.8.

The normal version's average time is about 10.4.

Why?

I found the problem: my function is not heavy enough. Thanks, everybody.

I found the problem: my function is not heavy enough. Thanks, everybody.