Why is a copy using shared memory faster than a direct copy?

Hi, I am new to GPU programming and I am currently reading a simple CUDA example that copies one matrix to another:

//a simple copy
__global__ void copy(float *odata, const float *idata){
    int x = blockIdx.x*TILE_DIM + threadIdx.x;
    int y = blockIdx.y*TILE_DIM + threadIdx.y;
    int width = gridDim.x*TILE_DIM;

    for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS) //? why blocking?
        odata[(y+j)*width + x] = idata[(y+j)*width + x];
}

//copy using shared memory
__global__ void copySharedMem(float *odata, const float *idata){
    __shared__ float tile[TILE_DIM*TILE_DIM];

    int x = blockIdx.x*TILE_DIM + threadIdx.x;
    int y = blockIdx.y*TILE_DIM + threadIdx.y;
    int width = gridDim.x*TILE_DIM;

    for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];

    __syncthreads();
    for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}

The test results show that the second one is about 7% faster, but I don't understand why. It seems to me that the second one involves one extra copy. Could someone give me some hints? Thanks!

Perhaps you’ve made a benchmarking error. Questions like this are difficult to address without a completely specified test case, in my opinion.

A simple test case built from your code indicates to me that copy is not slower than copySharedMem:

$ cat t393.cu
const int TILE_DIM=32;
const int BLOCK_ROWS=32;
//a simple copy
__global__ void copy(float *odata, const float *idata){
int x = blockIdx.x*TILE_DIM + threadIdx.x;
int y = blockIdx.y*TILE_DIM + threadIdx.y;
int width = gridDim.x*TILE_DIM;

for(int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) //? why blocking?
odata[(y+j)*width + x] = idata[(y+j)*width + x];
}



//copy using shared memory
__global__ void copySharedMem(float *odata, const float*idata){
__shared__ float tile[TILE_DIM*TILE_DIM];

int x = blockIdx.x*TILE_DIM + threadIdx.x;
int y = blockIdx.y*TILE_DIM + threadIdx.y;
int width = gridDim.x*TILE_DIM;

for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];

__syncthreads();
for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}


int main(){

  float *o, *i;
  const int ds = 1024;
  const int dsize = ds*ds*sizeof(o[0]);
  cudaMalloc(&o, dsize);
  cudaMalloc(&i, dsize);
  const int bs = 32;
  dim3 block(bs,bs);
  dim3 grid(ds/bs, ds/bs);
  copy<<<grid,block>>>(o, i);
  copy<<<grid,block>>>(o, i);
  copySharedMem<<<grid,block>>>(o, i);
  copySharedMem<<<grid,block>>>(o, i);
  cudaDeviceSynchronize();
}

$ nvcc -o t393 t393.cu
$ nvprof ./t393
==10333== NVPROF is profiling process 10333, command: ./t393
==10333== Profiling application: ./t393
==10333== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   51.28%  42.208us         2  21.104us  21.024us  21.184us  copySharedMem(float*, float const *)
                   48.72%  40.095us         2  20.047us  18.879us  21.216us  copy(float*, float const *)
      API calls:   98.16%  390.10ms         2  195.05ms  275.50us  389.82ms  cudaMalloc
                    1.23%  4.9023ms       384  12.766us     370ns  520.99us  cuDeviceGetAttribute
                    0.43%  1.6923ms         4  423.08us  250.65us  682.53us  cuDeviceTotalMem
                    0.13%  516.04us         4  129.01us  98.644us  204.99us  cuDeviceGetName
                    0.03%  103.92us         4  25.979us  11.551us  62.850us  cudaLaunchKernel
                    0.01%  45.516us         1  45.516us  45.516us  45.516us  cudaDeviceSynchronize
                    0.01%  22.860us         4  5.7150us  3.8420us  8.4600us  cuDeviceGetPCIBusId
                    0.00%  8.7950us         8  1.0990us     478ns  2.5720us  cuDeviceGet
                    0.00%  3.9900us         3  1.3300us     374ns  2.1980us  cuDeviceGetCount
                    0.00%  2.6200us         4     655ns     577ns     753ns  cuDeviceGetUuid
$

Tesla P100, CUDA 10.0, CentOS 7

Hi Robert,

Thanks very much for creating this test example! I just re-checked the code I was looking at and found one setting in the original code that helps accelerate copySharedMem(): instead of using BLOCK_ROWS = TILE_DIM = 32, the original code uses BLOCK_ROWS = 8.

const int TILE_DIM=32;
const int BLOCK_ROWS=8;
//a simple copy
__global__ void copy(float *odata, const float *idata){
    int x = blockIdx.x*TILE_DIM + threadIdx.x;
    int y = blockIdx.y*TILE_DIM + threadIdx.y;
    int width = gridDim.x*TILE_DIM;
    
    for(int j = 0; j < TILE_DIM; j+= BLOCK_ROWS) //? why blocking?
        odata[(y+j)*width + x] = idata[(y+j)*width + x];
}

//copy using shared memory
__global__ void copySharedMem(float *odata, const float*idata){
    __shared__ float tile[TILE_DIM*TILE_DIM];
    
    int x = blockIdx.x*TILE_DIM + threadIdx.x;
    int y = blockIdx.y*TILE_DIM + threadIdx.y;
    int width = gridDim.x*TILE_DIM;
    
    for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x] = idata[(y+j)*width + x];
    
    __syncthreads();
    for(int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        odata[(y+j)*width + x] = tile[(threadIdx.y+j)*TILE_DIM + threadIdx.x];
}

int main(){
    
    int num_reps=200;
    
    float *o, *i;
    const int ds = 1024;
    const int dsize = ds*ds*sizeof(o[0]);
    cudaMalloc(&o, dsize);
    cudaMalloc(&i, dsize);
    dim3 block(TILE_DIM,BLOCK_ROWS);
    dim3 grid(ds/TILE_DIM, ds/TILE_DIM);
    for(int n=0; n<num_reps; n++)
        copySharedMem<<<grid,block>>>(o, i);
    for(int n=0; n<num_reps; n++)
        copy<<<grid,block>>>(o, i);
    cudaDeviceSynchronize();
}

And now copySharedMem() consistently takes less time than copy(). However, I haven't figured out why this blocking trick helps to accelerate the copy through shared memory. I am also using a Tesla P100 (one GPU on an IBM Power8 system).

Case 1: TILE_DIM=32, BLOCK_ROWS=8

$ mpirun -np 1 nvprof ./main
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   51.18%  3.5459ms       200  17.729us  17.378us  18.595us  copy(float*, float const *)
                   48.82%  3.3828ms       200  16.914us  16.226us  17.986us  copySharedMem(float*, float const *)

Case 2: TILE_DIM=32, BLOCK_ROWS=32

$ mpirun -np 1 nvprof ./main
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   53.69%  4.2012ms       200  21.006us  20.387us  21.955us  copySharedMem(float*, float const *)
                   46.31%  3.6243ms       200  18.121us  17.347us  18.947us  copy(float*, float const *)

Thanks,
M.

My guess would be bus turnaround.

If you pull a bunch of stuff into shared memory, and then blast it out, you can get some small benefit from the fact that you are doing a bunch of reads followed by a bunch of writes.

The copy code, on the other hand, has reads and writes possibly closer together, requiring turnaround on the DRAM bus.

This is just a guess. However, I do know that bus turnaround can account for roughly a 10% measured bandwidth difference on a P100 between a copy kernel and a kernel that simply reads memory in bulk, or writes memory in bulk. If you only do reads, or only do writes, you can witness higher bandwidth than if you do a mix of both. It stands to reason that introducing bursty behavior may also be better than just doing read/write/read/write…
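
In case it helps, here is a rough, untested sketch of the kind of comparison I mean: a kernel that only reads, a kernel that only writes, and a copy kernel that does both. The kernel names, launch configuration, and buffer size are just illustrative (not taken from your code); each kernel could be timed with nvprof the same way as t393.cu.

//illustrative sketch: bulk-read vs. bulk-write vs. copy bandwidth
const int N = 32*1024*1024;

__global__ void read_only(const float *idata, float *sink){
  float v = 0.0f;
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += gridDim.x*blockDim.x)
    v += idata[i];                  // only reads hit DRAM
  if (v == -1.0f) *sink = v;        // keeps the read loop from being optimized away
}

__global__ void write_only(float *odata){
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += gridDim.x*blockDim.x)
    odata[i] = 1.0f;                // only writes hit DRAM
}

__global__ void copy_rw(float *odata, const float *idata){
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += gridDim.x*blockDim.x)
    odata[i] = idata[i];            // interleaved reads and writes (the mix that may incur bus turnaround)
}

int main(){
  float *i, *o, *sink;
  cudaMalloc(&i, N*sizeof(float));
  cudaMalloc(&o, N*sizeof(float));
  cudaMalloc(&sink, sizeof(float));
  read_only<<<160,1024>>>(i, sink);
  write_only<<<160,1024>>>(o);
  copy_rw<<<160,1024>>>(o, i);
  cudaDeviceSynchronize();
}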

But it’s just speculation.

Hi Robert,

Thanks for offering the “bus turnaround” speculation! Interesting and reasonable!

Thanks,
M.