Hi,
I’m actually trying trying some basic examples to try optimizing shared memory access by avoiding shared memory conflicts.
So I have two kernels : one theoretically optimized, one not.
The thing is that when I measure the time spent by eahc kernel, the not optimized one takes much less time.
Here is the code. It’s very basic so I hope you can give a look at it. Only the accesses to temp_result part is modified, the rest of the code remains untouched.
Thanks in advance,
Vince
dim3 blocks(16, 16, 1);
dim3 grids(dimx / blocks.x + (dimx%blocks.x?1:0), dimz*dimy/4 / blocks.y + (dimy%blocks.y?1:0));
__global__ void morpho_optimize(unsigned char* odata, unsigned char* idata, int mem_size)
{
__shared__ unsigned char temp_result[BLOCK_SIZE_Y][4*BLOCK_SIZE_X];
uchar4 pixels4;
int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
int index = (gridDim.x * blockDim.x * yindex) + xindex;
if ( index < mem_size )
{
// load 4 pixels
pixels4 = ((uchar4*)idata)[index];
// compare each pixel to THRESH
temp_result[threadIdx.y][threadIdx.x] = pixels4.x < THRESH ? 0 : 255;
__syncthreads();
temp_result[threadIdx.y][blockDim.x+threadIdx.x] = pixels4.y < THRESH ? 0 : 255;
__syncthreads();
temp_result[threadIdx.y][2*blockDim.x+threadIdx.x] = pixels4.z < THRESH ? 0 : 255;
__syncthreads();
temp_result[threadIdx.y][3*blockDim.x+threadIdx.x] = pixels4.w < THRESH ? 0 : 255;
__syncthreads();
// store the 4-pixel add back to mem
pixels4.x = temp_result[threadIdx.y][threadIdx.x];
__syncthreads();
pixels4.y = temp_result[threadIdx.y][blockDim.x+threadIdx.x];
__syncthreads();
pixels4.z = temp_result[threadIdx.y][2*blockDim.x+threadIdx.x];
__syncthreads();
pixels4.w = temp_result[threadIdx.y][3*blockDim.x+threadIdx.x];
__syncthreads();
((uchar4*)odata)[index] = pixels4;
}
}
__global__ void morpho(unsigned char* odata, unsigned char* idata, int mem_size)
{
__shared__ unsigned char temp_result[BLOCK_SIZE_Y][4*BLOCK_SIZE_X];
uchar4 pixels4;
int xindex = (blockIdx.x * blockDim.x) + threadIdx.x;
int yindex = (blockIdx.y * blockDim.y) + threadIdx.y;
int index = (gridDim.x * blockDim.x * yindex) + xindex;
if ( index < mem_size )
{
// load 4 pixels
pixels4 = ((uchar4*)idata)[index];
// compare each pixel to THRESH
temp_result[threadIdx.y][4*threadIdx.x] = pixels4.x < THRESH ? 0 : 255;
temp_result[threadIdx.y][4*threadIdx.x+1] = pixels4.y < THRESH ? 0 : 255;
temp_result[threadIdx.y][4*threadIdx.x+2] = pixels4.z < THRESH ? 0 : 255;
temp_result[threadIdx.y][4*threadIdx.x+3] = pixels4.w < THRESH ? 0 : 255;
// store the 4-pixel add back to mem
pixels4.x = temp_result[threadIdx.y][4*threadIdx.x];
pixels4.y = temp_result[threadIdx.y][4*threadIdx.x+1];
pixels4.z = temp_result[threadIdx.y][4*threadIdx.x+2];
pixels4.w = temp_result[threadIdx.y][4*threadIdx.x+3];
((uchar4*)odata)[index] = pixels4;
}
}