Hey guys,
Regarding the matrix transpose kernel: I implemented my own design, which was the same as the "naive" design in the SDK. Afterwards I looked at the optimized design and understood what it does, but I don't know where its performance comes from. If you look at the code, it does almost the same thing, with the small difference that it stores the values in a temporary (shared-memory) matrix and then copies them to the destination — so we have two cycles of memory read/write. But the naive design also reads A(x,y) [memory read]
and copies it to B(y,x) [memory write], so the number of read/write cycles is the same. Yet if you run the two kernels, you will see that design B is much faster than the naive one.
Can you tell me what the difference is?
Design A: naive design for matrix transpose
// Naive transpose: one thread per element, odata[x][y] = idata[y][x].
// Expects a 2D launch covering a (width x height) input; out-of-range
// threads do nothing.  The read idata[col + width*row] is coalesced
// (consecutive threadIdx.x -> consecutive addresses), but the write
// odata[row + height*col] strides by 'height' across a warp, so the
// global-memory writes are uncoalesced -- this is the kernel's bottleneck.
__global__ void transpose_naive(float *odata, float* idata, int width, int height)
{
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard the grid tail: drop threads that fall outside the matrix.
    if (col >= width || row >= height)
        return;

    odata[row + height * col] = idata[col + width * row];
}
Design B: Optimized kernel for matrix transpose
// Tiled transpose: stages a BLOCK_DIM x BLOCK_DIM tile in shared memory so
// that BOTH the global read and the global write are coalesced; the actual
// transposition happens in the (fast) shared-memory access pattern.
// Expects blockDim == (BLOCK_DIM, BLOCK_DIM).  Fix vs. the SDK snippet:
// the bounds checks are re-enabled, so width/height no longer need to be
// multiples of BLOCK_DIM.
__global__ void transpose(float *odata, float *idata, int width, int height)
{
    // +1 column of padding shifts each row into a different shared-memory
    // bank, so the column-wise read below is bank-conflict free.
    __shared__ float block[BLOCK_DIM][BLOCK_DIM+1];

    // Read phase: consecutive threadIdx.x values read consecutive idata
    // addresses -> fully coalesced global load into the tile.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // Barrier is outside the divergent guards, so every thread of the block
    // reaches it.  Each shared slot consumed below was written above under
    // the identical condition (see write-phase guard), so no uninitialized
    // shared memory is ever read.
    __syncthreads();

    // Write phase: swap the block coordinates so that consecutive
    // threadIdx.x values write consecutive odata addresses -> the global
    // store is coalesced as well.  Output has dimensions height x width.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}