Can anyone help me understand performance difference between memCopy2dA and memCopy2dB kernels?

They are supposed to copy 2D data with size xLen,yLen from one place to the other but they are using different strategies:

- when memCopy2dA is used blocks/threads cover whole 2D space since this kernel is suppose to copy only one data point
- when memCopy2dB is used blocks/threads are created only for one whole X row, and then each kernel is looping over Y direction to copy all data.

According to profiler (nvvp) in both cases GPU access memory pattern is 100% and X dimension is big enough to saturate device for “B” kernel (Titan X, 24SM). Unfortunately “B” kernel is slower and on my machine result is:

```
GB/s: 270.715
GB/s: 224.405
```

Additional question: Is it even possible to be close to theoretical memory bandwidth limit which is 336.48 GB/s (3505MHz * 384 bits * 2 / 8)? At least my tests shows max always around 271-272 GB/s.

Test code:

```
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <chrono>
template<typename T>
__global__ void memCopy2dA(T *in, T *out, size_t xLen, size_t yLen) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
int yi = blockIdx.y * blockDim.y + threadIdx.y;
if (xi < xLen && yi < yLen) {
out[yi * xLen + xi] = in[yi * xLen + xi];
}
}
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
if (xi < xLen) {
size_t idx = xi;
for (int y = 0; y < yLen; ++y) {
out[idx] = in[idx];
idx += xLen;
}
}
}
static void waitForCuda() {
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err));
}
int main() {
typedef float T;
size_t xLen = 24 * 32 * 64; //49152
size_t yLen = 1024;
size_t dataSize = xLen * yLen * sizeof(T);
T *dInput;
cudaMalloc(&dInput, dataSize);
T *dOutput;
cudaMalloc(&dOutput, dataSize);
const int numOfRepetitions = 100;
double gigabyte = 1000 * 1000 * 1000;
{
dim3 threadsPerBlock(64, 1);
dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x,
(yLen + threadsPerBlock.y - 1) / threadsPerBlock.y);
auto startTime = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numOfRepetitions; ++i) {
memCopy2dA <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
waitForCuda();
}
auto stopTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = stopTime - startTime;
std::cout << "GB/s: " << (2 * dataSize * numOfRepetitions) / elapsed.count() / gigabyte << std::endl;
}
{
dim3 threadsPerBlock(64);
dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x);
auto startTime = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numOfRepetitions; ++i) {
memCopy2dB <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
waitForCuda();
}
auto stopTime = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = stopTime - startTime;
std::cout << "GB/s: " << ((2 * dataSize * numOfRepetitions) / elapsed.count()) / gigabyte << std::endl;
}
cudaFree(dInput);
cudaFree(dOutput);
return 0;
}
```

compiled with:

```
nvcc -std=c++11 memTest.cu -o memTest
```

According to my understanding both kernels are doing nice aligned coalesced memory reading/writing. In kernel “B” each warp has 32 threads working with 4 byte (float) elements doing 128 bytes of RD/WR at a time. Also test data dimensions are “nice” - everything fits perfectly without having idle threads or so. That is why I’m confused.