Hello,
I have been looking at the fine grained kernel in the transposeNew example. I read the white paper included with the 2.3 CUDA SDK and got interested in changing the fine grained kernel to transpose bigger chunks of the matrix at a time, for example 64 x 64 tiles instead of the 32 x 32 tiles the example uses. I know there is only 16KB of shared memory per multiprocessor, so fitting a whole 64 x 64 sub-matrix of float2 (64*64*8 bytes = 32KB) in shared memory at once is impossible. I wrote a prototype kernel that is supposed to do a batch transpose of 64 x 64 sub-matrices of a 1024 x 1024 matrix. The kernel is launched with a 16 x 16 grid and 64 x 8 threads per block. Any help is very welcome.
[codebox]
__global__ void tran64(float2 *odata, float2 *idata, int width, int height)
{
    __shared__ float blockx[TILE_64*(TILE_8+1)];
    __shared__ float blocky[TILE_64*(TILE_8+1)];

    int index_block     = threadIdx.x + threadIdx.y*TILE_64;
    int index_transpose = threadIdx.y + threadIdx.x*TILE_8;

    int xIndex = blockIdx.x * TILE_64 + threadIdx.x;
    int yIndex = blockIdx.y * TILE_64 + threadIdx.y;
    int index  = xIndex + yIndex*width;

    // walk down the 64x64 tile in 64x8 slabs
    for (int i = 0; i < 8; i++) {
        // load one 64x8 slab into shared memory
        blockx[index_block] = idata[index + width*i*TILE_8].x;
        blocky[index_block] = idata[index + width*i*TILE_8].y;
        __syncthreads();
        // write the slab back, reading shared memory through the transposed index
        odata[index + width*i].x = blockx[index_transpose];
        odata[index + width*i].y = blocky[index_transpose];
    }
}
[/codebox]
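For reference, this is roughly how I set things up and launch it on the host. TILE_64 = 64 and TILE_8 = 8 are my own defines, and the allocation part is just a sketch of what I'm doing:
[codebox]
#include <cuda_runtime.h>
#include <stdlib.h>

#define TILE_64 64
#define TILE_8   8

__global__ void tran64(float2 *odata, float2 *idata, int width, int height);

int main(void)
{
    int width  = 1024;
    int height = 1024;
    size_t mem_size = width * height * sizeof(float2);

    // h_idata holds the input matrix (filled elsewhere), h_odata receives the result
    float2 *h_idata = (float2 *)malloc(mem_size);
    float2 *h_odata = (float2 *)malloc(mem_size);

    float2 *d_idata, *d_odata;
    cudaMalloc((void **)&d_idata, mem_size);
    cudaMalloc((void **)&d_odata, mem_size);
    cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);

    // one block per 64x64 tile: 16 x 16 grid, 64 x 8 threads per block
    dim3 grid(width / TILE_64, height / TILE_64);
    dim3 threads(TILE_64, TILE_8);
    tran64<<<grid, threads>>>(d_odata, d_idata, width, height);

    cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

    free(h_idata); free(h_odata);
    cudaFree(d_idata); cudaFree(d_odata);
    return 0;
}
[/codebox]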
Thanks
I decided to try a different method: looping over 32x32 tiles four times within each 64x64 tile. It worked. Here is my kernel code; it hasn't been optimized yet, so it could still be improved.
[codebox]
__global__ void tran64(float2 *odata, float2 *idata, int width, int height)
{
    __shared__ float blockx[TILE_DIM][TILE_DIM+1];
    __shared__ float blocky[TILE_DIM][TILE_DIM+1];

    // loop over the four 32x32 inner tiles of the 64x64 tile
    for (int b = 0; b < 4; b++) {
        int offset_inx_outy = b&1;  /* b % 2 */
        int offset_iny_outx = b>>1; /* b / 2 */

        // read coordinates of this inner tile
        int xIndex = blockIdx.x*TILE_64 + offset_inx_outy*TILE_DIM + threadIdx.x;
        int yIndex = blockIdx.y*TILE_64 + offset_iny_outx*TILE_DIM + threadIdx.y;
        int index  = xIndex + yIndex*width;

        for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
            blockx[threadIdx.y+i][threadIdx.x] = idata[index + i*width].x;
            blocky[threadIdx.y+i][threadIdx.x] = idata[index + i*width].y;
        }

        // write coordinates: the inner tile offsets are swapped,
        // so the 64x64 tile gets transposed in place
        int xIndex2 = blockIdx.x*TILE_64 + offset_iny_outx*TILE_DIM + threadIdx.x;
        int yIndex2 = blockIdx.y*TILE_64 + offset_inx_outy*TILE_DIM + threadIdx.y;
        int index_out = xIndex2 + yIndex2*width;

        __syncthreads();

        for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
            odata[index_out + i*height].x = blockx[threadIdx.x][threadIdx.y+i];
            odata[index_out + i*height].y = blocky[threadIdx.x][threadIdx.y+i];
        }

        // make sure everyone is done reading shared memory
        // before the next inner tile overwrites it
        __syncthreads();
    }
}
[/codebox]
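In case anyone wants to try it: this is roughly how I launch and sanity-check it, reusing the host buffers from the snippet in my first post. TILE_DIM = 32 and BLOCK_ROWS = 8 here, and the check only verifies that every 64 x 64 tile gets transposed in place, since the fine grained kernel doesn't move the tiles themselves:
[codebox]
#define TILE_64    64
#define TILE_DIM   32
#define BLOCK_ROWS  8

// same buffers and sizes as in the host snippet from my first post
dim3 grid(width / TILE_64, height / TILE_64);   // 16 x 16, one block per 64x64 tile
dim3 threads(TILE_DIM, BLOCK_ROWS);             // 32 x 8 threads
tran64<<<grid, threads>>>(d_odata, d_idata, width, height);
cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

// each 64x64 tile should be transposed in place (the tiles themselves don't move)
int ok = 1;
for (int by = 0; by < height / TILE_64; by++)
    for (int bx = 0; bx < width / TILE_64; bx++)
        for (int r = 0; r < TILE_64; r++)
            for (int c = 0; c < TILE_64; c++) {
                float2 out = h_odata[(by*TILE_64 + r)*width + (bx*TILE_64 + c)];
                float2 in  = h_idata[(by*TILE_64 + c)*width + (bx*TILE_64 + r)];
                if (out.x != in.x || out.y != in.y) ok = 0;
            }
printf(ok ? "test PASSED\n" : "test FAILED\n");
[/codebox]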