I’m optimizing a kernel that analyzes a row-major matrix across columns. The thread must retain state, therefore it can’t simply be a monolithic kernel. The natural thing to do is to have one thread per column. I wrote a kernel that merely copies data while traversing the matrix columns, to measure the upper bound of bandwidth.

I noticed that if each thread loops across the whole column, it get a lower bandwidth than having a shorter span, while splitting the column into separate blocks. What is the reason for that, if the number of threads is constant and memory access is theoretically coalesced?

I modified the code from this post (https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ , An Efficient Matrix Transpose in CUDA C/C++) trying to understand what is going on. I always have 1024 threads, analyzing a 1024x1024 matrix. In one extreme each thread is moving across a whole column. If I split each column in two, with two threads per column, keeping the number of vertical blocks == 1 and doubling the number of horizontal blocks, the kernel runs faster, until I reach the point where each thread moves only across 32 values, and we need 32 blocks to cover the whole matrix.

The number of threads is constant, and it’s a simple copy kernel. Memory should be coalesced, if I didn’t make any mistakes. There’s no use of shared memory to cause conflicts. What is the reason for this change in bandwidth?

Should I always divide my workload in tiles somehow, and avoid threads that cover long ranges of memory in a for-loop? What would be the reason for that, if memory access by the warp is always linear?

```
Device : Quadro P1000
Matrix size: 1024 1024, Block size: 1024 1, Tile size: 1024 1024
dimGrid: 1 1 1. dimBlock: 1024 1 1
Routine Bandwidth (GB/s)
copy_scattered 23.62
copy_blocky 23.27
Matrix size: 1024 1024, Block size: 1024 2, Tile size: 1024 1024
dimGrid: 2 1 1. dimBlock: 512 2 1
Routine Bandwidth (GB/s)
copy_scattered 39.53
copy_blocky 38.22
Matrix size: 1024 1024, Block size: 1024 4, Tile size: 1024 1024
dimGrid: 4 1 1. dimBlock: 256 4 1
Routine Bandwidth (GB/s)
copy_scattered 64.82
copy_blocky 55.65
Matrix size: 1024 1024, Block size: 1024 8, Tile size: 1024 1024
dimGrid: 8 1 1. dimBlock: 128 8 1
Routine Bandwidth (GB/s)
copy_scattered 75.89
copy_blocky 80.02
Matrix size: 1024 1024, Block size: 1024 16, Tile size: 1024 1024
dimGrid: 16 1 1. dimBlock: 64 16 1
Routine Bandwidth (GB/s)
copy_scattered 79.81
copy_blocky 79.70
Matrix size: 1024 1024, Block size: 1024 32, Tile size: 1024 1024
dimGrid: 32 1 1. dimBlock: 32 32 1
Routine Bandwidth (GB/s)
copy_scattered 75.72
copy_blocky 75.57
Matrix size: 1024 1024, Block size: 1024 64, Tile size: 1024 1024
dimGrid: 64 1 1. dimBlock: 16 64 1
Routine Bandwidth (GB/s)
copy_scattered 74.10
copy_blocky 73.41
Matrix size: 1024 1024, Block size: 1024 128, Tile size: 1024 1024
dimGrid: 128 1 1. dimBlock: 8 128 1
Routine Bandwidth (GB/s)
copy_scattered 53.39
copy_blocky 49.87
Matrix size: 1024 1024, Block size: 1024 256, Tile size: 1024 1024
dimGrid: 256 1 1. dimBlock: 4 256 1
Routine Bandwidth (GB/s)
copy_scattered 46.99
copy_blocky 47.19
Matrix size: 1024 1024, Block size: 1024 512, Tile size: 1024 1024
dimGrid: 512 1 1. dimBlock: 2 512 1
Routine Bandwidth (GB/s)
copy_scattered 28.67
copy_blocky 27.38
Matrix size: 1024 1024, Block size: 1024 1024, Tile size: 1024 1024
dimGrid: 1024 1 1. dimBlock: 1 1024 1
Routine Bandwidth (GB/s)
copy_scattered 14.94
copy_blocky 14.37
```

The code:

```
#include <stdio.h>
#include <assert.h>
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
const int TILE_DIM = 1024;
//const int BLOCK_ROWS = 4;
const int NUM_REPS = 100;
// Check errors and print GB/s
void postprocess(const float *ref, const float *res, int n, float ms) {
bool passed = true;
for (int i = 0; i < n; i++)
if (res[i] != ref[i]) {
printf("%d %f %f\n", i, res[i], ref[i]);
printf("%25s\n", "*** FAILED ***");
passed = false;
break;
}
if (passed)
printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms);
}
// simple copy kernel
// Used as reference case representing best effective bandwidth.
__global__ void copy_scattered(float *odata, const float *idata, int BLOCK_ROWS) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * blockDim.x;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y + j) * width + x] = idata[(y + j) * width + x];
}
// simple copy kernel
// Used as reference case representing best effective bandwidth.
__global__ void copy_blocky(float *odata, const float *idata, int BLOCK_ROWS) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y * (TILE_DIM / BLOCK_ROWS);
int width = gridDim.x * blockDim.x;
for (int j = 0; j < TILE_DIM / BLOCK_ROWS; j += 1)
odata[(y + j) * width + x] = idata[(y + j) * width + x];
}
int main(int argc, char **argv) {
const int nx = 1024;
const int ny = 1024;
const int mem_size = nx * ny * sizeof(float);
int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
cudaDeviceProp prop;
checkCuda(cudaGetDeviceProperties(&prop, devId));
printf("\nDevice : %s\n", prop.name);
checkCuda(cudaSetDevice(devId));
float *h_idata = (float *) malloc(mem_size);
float *h_cdata = (float *) malloc(mem_size);
float *h_tdata = (float *) malloc(mem_size);
float *gold = (float *) malloc(mem_size);
float *d_idata, *d_cdata, *d_tdata;
checkCuda(cudaMalloc(&d_idata, mem_size));
checkCuda(cudaMalloc(&d_cdata, mem_size));
checkCuda(cudaMalloc(&d_tdata, mem_size));
for (int BLOCK_ROWS = 1; BLOCK_ROWS <= 1024; BLOCK_ROWS <<= 1) {
dim3 dimGrid(nx / (1024 / BLOCK_ROWS), ny / TILE_DIM, 1);
dim3 dimBlock(1024 / BLOCK_ROWS, BLOCK_ROWS, 1);
printf("Matrix size: %d %d, Block size: %d %d, Tile size: %d %d\n",
nx, ny, TILE_DIM, BLOCK_ROWS, TILE_DIM, TILE_DIM);
printf("dimGrid: %d %d %d. dimBlock: %d %d %d\n",
dimGrid.x, dimGrid.y, dimGrid.z, dimBlock.x, dimBlock.y, dimBlock.z);
// check parameters and calculate execution configuration
if (nx % TILE_DIM || ny % TILE_DIM) {
printf("nx and ny must be a multiple of TILE_DIM\n");
goto error_exit;
}
if (TILE_DIM % BLOCK_ROWS) {
printf("TILE_DIM must be a multiple of BLOCK_ROWS\n");
goto error_exit;
}
// host
for (int j = 0; j < ny; j++)
for (int i = 0; i < nx; i++)
h_idata[j * nx + i] = j * nx + i;
// correct result for error checking
for (int j = 0; j < ny; j++)
for (int i = 0; i < nx; i++)
gold[j * nx + i] = h_idata[i * nx + j];
// device
checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// events for timing
cudaEvent_t startEvent, stopEvent;
checkCuda(cudaEventCreate(&startEvent));
checkCuda(cudaEventCreate(&stopEvent));
float ms;
// ------------
// time kernels
// ------------
printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)");
// ----
// copy scattered
// ----
printf("%25s", "copy_scattered");
checkCuda(cudaMemset(d_cdata, 0, mem_size));
// warm up
copy_scattered<<<dimGrid, dimBlock>>>(d_cdata, d_idata, BLOCK_ROWS);
checkCuda(cudaEventRecord(startEvent, 0));
for (int i = 0; i < NUM_REPS; i++)
copy_scattered<<<dimGrid, dimBlock>>>(d_cdata, d_idata, BLOCK_ROWS);
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
checkCuda(cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost));
postprocess(h_idata, h_cdata, nx * ny, ms);
// ----
// copy blocky
// ----
printf("%25s", "copy_blocky");
checkCuda(cudaMemset(d_cdata, 0, mem_size));
// warm up
copy_blocky<<<dimGrid, dimBlock>>>(d_cdata, d_idata, BLOCK_ROWS);
checkCuda(cudaEventRecord(startEvent, 0));
for (int i = 0; i < NUM_REPS; i++)
copy_blocky<<<dimGrid, dimBlock>>>(d_cdata, d_idata, BLOCK_ROWS);
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
checkCuda(cudaMemcpy(h_cdata, d_cdata, mem_size, cudaMemcpyDeviceToHost));
postprocess(h_idata, h_cdata, nx * ny, ms);
error_exit:
// cleanup
checkCuda(cudaEventDestroy(startEvent));
checkCuda(cudaEventDestroy(stopEvent));
}
checkCuda(cudaFree(d_tdata));
checkCuda(cudaFree(d_cdata));
checkCuda(cudaFree(d_idata));
free(h_idata);
free(h_tdata);
free(h_cdata);
free(gold);
}
```