While trying to understand the matrix transpose project shipped with the CUDA SDK (under project/transpose/), I noticed that the kernel “transpose_naive” does not seem to do any read/write synchronization to make sure the correct values are read and written.
Can anybody point out how CUDA makes sure the output of “transpose_naive” is correct?
Thank you very much!
// This naive transpose kernel suffers from completely non-coalesced writes.
// It can be up to 10x slower than the kernel above for large matrices.
__global__ void transpose_naive(float *odata, float* idata, int width, int height)
{
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;

    if (xIndex < width && yIndex < height)
    {
        unsigned int index_in  = xIndex + width * yIndex;
        unsigned int index_out = yIndex + height * xIndex;
        odata[index_out] = idata[index_in];
    }
}
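For comparison, the optimized kernel in the same project stages each tile in shared memory, roughly like this (a sketch from memory, so the exact SDK code may differ in details such as the +1 padding; BLOCK_DIM is the 16x16 tile size also used in my test code below):

// Tiled transpose: coalesced reads AND coalesced writes.
// A BLOCK_DIM x BLOCK_DIM tile is staged in shared memory; the block's
// output coordinates are swapped, but threads within a warp still write
// consecutive addresses.
__global__ void transpose(float *odata, float *idata, int width, int height)
{
    __shared__ float tile[BLOCK_DIM][BLOCK_DIM + 1];   // +1 avoids shared memory bank conflicts

    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if (xIndex < width && yIndex < height)
    {
        tile[threadIdx.y][threadIdx.x] = idata[yIndex * width + xIndex];
    }

    __syncthreads();

    // swap the block coordinates for the output; the thread coordinates stay
    // row-major, so consecutive threads write consecutive addresses
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if (xIndex < height && yIndex < width)
    {
        odata[yIndex * height + xIndex] = tile[threadIdx.x][threadIdx.y];
    }
}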
I thought the main reason we use shared memory to boost performance is that we can write a whole block of results back to device memory at once instead of writing them one by one, so we save time by accessing device memory far fewer times (instead of one access per thread, only about number_of_threads / (BLOCK_DIM * BLOCK_DIM) accesses).
So if we change the transpose code into a simple copy operation, I expected the block copy to still be faster than the thread-by-thread copy. But my test tells me the thread-by-thread copy is faster than the block copy. Can anybody point out why?
Here is the code:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
const unsigned int BLOCK_DIM = 16;
// kernel copy
// device memory copy entry by entry
__global__ void copy(float *outData, float* inData, unsigned int width, unsigned int height)
{
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;

    if( xIndex < width && yIndex < height )
    {
        unsigned int index = yIndex * width + xIndex;
        outData[index] = inData[index];
    }
}
// kernel copy
// shared memory temporary save, block copy back to device memory
__global__ void blockCopy(float* outData, float* inData, unsigned int width, unsigned int height)
{
    __shared__ float block[BLOCK_DIM][BLOCK_DIM];

    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    unsigned int index  = yIndex * width + xIndex;

    if(xIndex < width && yIndex < height)
    {
        block[threadIdx.x][threadIdx.y] = inData[index];
    }
    __syncthreads();

    outData[index] = block[threadIdx.x][threadIdx.y];
}
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    runTest( argc, argv);
    cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
    // size of the matrix
    const unsigned int width = 1024;
    const unsigned int height = 1024;
    // size of memory required to store the matrix
    const unsigned int memSize = sizeof(float) * width * height;

    unsigned int timer;
    cutCreateTimer(&timer);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    // allocate memory on host
    float *h_idata = (float*) malloc(memSize);
    // initialize the memory
    for( unsigned int i = 0; i < (width*height); ++i)
    {
        h_idata[i] = (float) i;
    }

    // allocate memory on device
    float* d_idata;   // input memory
    float* d_odata;   // output memory
    cutilSafeCall( cudaMalloc( (void**) &d_idata, memSize ));
    cutilSafeCall( cudaMalloc( (void**) &d_odata, memSize ));

    // copy host memory to device
    cutilSafeCall( cudaMemcpy( d_idata, h_idata, memSize, cudaMemcpyHostToDevice ));

    // set up grid, block dimensions
    dim3 grid(width / BLOCK_DIM, height / BLOCK_DIM, 1);
    dim3 block(BLOCK_DIM, BLOCK_DIM, 1);

    // warmup so we don't time CUDA startup
    copy<<< grid, block >>>( d_odata, d_idata, width, height);
    blockCopy<<< grid, block >>>( d_odata, d_idata, width, height);
    cudaThreadSynchronize();

    printf("Copying a %d by %d matrix of floats...\n", width, height);
    int numIterations = 4;

    // execute the kernel: copy
    cutStartTimer(timer);
    for( unsigned int i = 0; i < numIterations; i++ )
    {
        copy<<< grid, block >>>( d_odata, d_idata, width, height);
    }
    cudaThreadSynchronize();
    cutStopTimer(timer);
    float copyTime = cutGetTimerValue(timer);

    // execute the kernel: blockCopy
    cutResetTimer(timer);
    cutStartTimer(timer);
    for( unsigned int i = 0; i < numIterations; i++ )
    {
        blockCopy<<< grid, block >>>( d_odata, d_idata, width, height);
    }
    cudaThreadSynchronize();
    cutStopTimer(timer);
    float blockCopyTime = cutGetTimerValue(timer);

    printf("copy average time: %0.3f ms\n", copyTime / numIterations);
    printf("block copy average time: %0.3f ms\n\n", blockCopyTime / numIterations);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // cleanup memory
    free(h_idata);
    cutilSafeCall(cudaFree(d_idata));
    cutilSafeCall(cudaFree(d_odata));
    cutilCheckError( cutDeleteTimer(timer));

    cudaThreadExit();
}
First, I need to correct your kernel “blockCopy”: you should apply the boundary condition again after the synchronization, before the write to outData.
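Roughly like this (just a sketch of the fix, not tested):

__global__ void blockCopy(float* outData, float* inData, unsigned int width, unsigned int height)
{
    __shared__ float block[BLOCK_DIM][BLOCK_DIM];

    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    unsigned int index  = yIndex * width + xIndex;

    if(xIndex < width && yIndex < height)
    {
        block[threadIdx.x][threadIdx.y] = inData[index];
    }
    __syncthreads();

    // repeat the boundary check: threads outside the matrix must not write
    if(xIndex < width && yIndex < height)
    {
        outData[index] = block[threadIdx.x][threadIdx.y];
    }
}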
—Thank you!
Remember that the GPU cannot copy data from global memory to shared memory directly; it loads the data into a register first and then moves it from the register into shared memory.
Nice, that was my blind spot!
I am still a bit confused about which part boosts the performance. As I read the code, both transpose and transpose_naive do the same thing when reading from global memory and only differ in where they write, and the reads are from contiguous memory (the best case we can have), so there is no room for improvement there.
The two kernels differ when they write the result back to global memory; __syncthreads() is only needed in the optimized version to make sure the correct result is written back.
Does the write of transpose beat transpose_naive because it writes block by block instead of thread by thread?
If so, which function call or flag tells it to write block by block? How would a programmer know whether a kernel writes block by block or thread by thread?
Question: Does the write of transpose beat transpose_naive because it writes block by block instead of thread by thread?
The key is: “does a warp access a contiguous pattern?” For the naive method, the writes of a warp hit a discontiguous (strided) pattern, but for the optimized method the writes of a warp hit a contiguous pattern.
When a warp accesses a contiguous pattern, the hardware groups those accesses into one big transaction (coalescing), so the number of read/write transactions decreases. That is why the optimized method is faster than the naive method.
Please check section 5.1.2.1 in the Programming Guide 2.3.
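To see the two write patterns side by side, here is a small standalone illustration (my own sketch, not SDK code) that prints the output indices touched by the first 16 threads of a warp in each kernel for the 1024 by 1024 case above. The naive kernel's writes are height elements apart, so each one ends up in its own memory transaction; the coalesced kernel's writes are consecutive, so the hardware can merge them.

#include <stdio.h>

int main(void)
{
    // 1024 x 1024 matrix, as in the test above; look at the first 16 threads
    // of the first warp (threadIdx.y == 0, block (0,0)) in each kernel.
    const unsigned int height = 1024;

    for (unsigned int tx = 0; tx < 16; ++tx)
    {
        unsigned int xIndex = tx;   // blockIdx.x = 0, threadIdx.x = tx
        unsigned int yIndex = 0;    // blockIdx.y = 0, threadIdx.y = 0

        // transpose_naive: index_out = yIndex + height * xIndex
        // -> consecutive threads write addresses 'height' elements apart
        unsigned int naive_out = yIndex + height * xIndex;

        // tiled transpose: consecutive threads write consecutive addresses
        unsigned int tiled_out = yIndex * height + xIndex;

        printf("thread %2u: naive writes odata[%7u], tiled writes odata[%4u]\n",
               tx, naive_out, tiled_out);
    }
    return 0;
}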