I am experimenting with memory coalescing in a program that applies a blur to an image, and I found that the version of the program with better coalescing is actually slower.
I am testing the horizontal blur. The blur filter is symmetric, so when computing a result value it is more efficient to access, one after the other, the pairs of elements that will be multiplied by the same coefficient (the first and the last, then the second and the second-to-last, and so on towards the middle of the filter), which halves the number of multiplications.
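To make the pattern concrete, here is a minimal host-side sketch of what I mean (the filter[] array is hypothetical and boundary checks are omitted; the indexing is the same as in the kernels below):

// Pairwise access for a symmetric filter of odd length n: taps k and
// n - 1 - k share a coefficient, so the two samples are summed first
// and multiplied once, halving the multiplications.
float apply_symmetric_filter(const unsigned short *row, int i, const float *filter, int n) {
    float c = 0.0f;
    int k;
    for(k = 0; k < n / 2; k++) {
        c += (row[i + k - n / 2] + row[i + n - 1 - k - n / 2]) * filter[k];
    }
    c += row[i] * filter[n / 2]; // center tap
    return c;
}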
I removed the filter and the multiplications to have fewer variables to deal with, but kept the access pattern the same.
If the image is in planar format (first all R values, then all G, then all B), memory coalescing is much better than if it is in pixel (interleaved) order (RGB, RGB, RGB, …), because the values read at the same time by consecutive threads are consecutive in memory.
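Concretely, these are the two index computations exactly as my kernels use them (pitches in elements; x is the column, y the row, z the channel):

// Planar: all R values, then all G, then all B. Consecutive x values of
// the same channel are at consecutive addresses.
size_t planar_index(int x, int y, int z, int height, size_t pitch) {
    return (z * height + y) * pitch + x;
}
// Interleaved: RGB, RGB, RGB, ... Consecutive x values of the same
// channel are 3 elements apart.
size_t interleaved_index(int x, int y, int z, size_t pitch) {
    return y * pitch + x * 3 + z;
}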
And indeed, nvprof confirms that the global memory load and store efficiencies (gld_efficiency and gst_efficiency) are better for the planar version.
Yet the kernel that blurs in planar format is actually slower, not faster.
Each value in the image is a 2-byte integer.
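(So a fully coalesced warp of 32 threads reading consecutive values touches 32 × 2 = 64 contiguous bytes, while with the interleaved layout the same 32 reads are spread over roughly three times that span.)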
The code that uses planar format is:
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

__global__
void kernel1(unsigned short *__restrict__ result, const unsigned short *__restrict__ img, int width, int height, size_t result_pitch, size_t img_pitch, int n) {
    int i, j, z, k, l, c, m;
    z = blockIdx.x;                            // color plane
    i = blockIdx.y * blockDim.y + threadIdx.y; // column
    j = blockIdx.z * blockDim.z + threadIdx.z; // row
    if(i < width && j < height) {
        c = 0;
        for(k = 0; k < (n >> 1); k++) {
            l = i + k - n / 2;                 // k-th tap from the left
            m = 0;
            if(0 <= l && l < width) {
                m = img[(z * height + j) * img_pitch + l];
            }
            l = i + n - 1 - k - n / 2;         // matching tap from the right
            if(0 <= l && l < width) {
                m += img[(z * height + j) * img_pitch + l];
            }
            c += m;
        }
        l = i + k - n / 2;                     // center tap (k == n / 2 here)
        if(0 <= l && l < width) {
            c += img[(z * height + j) * img_pitch + l];
        }
        result[(z * height + j) * result_pitch + i] = c / n;
    }
}
void blur(int width, int height) {
    int i;
    size_t img1_pitch, img2_pitch;
    unsigned short *__restrict__ img1, *__restrict__ img2;
    dim3 blocks(3, (width + 31) / 32, (height + 31) / 32);
    dim3 threadsPerBlock(1, 32, 32);
    cudaMallocPitch((void **)&img1, &img1_pitch, sizeof(unsigned short) * width, height * 3);
    img1_pitch /= sizeof(unsigned short);      // pitch in elements, not bytes
    cudaMallocPitch((void **)&img2, &img2_pitch, sizeof(unsigned short) * width, height * 3);
    img2_pitch /= sizeof(unsigned short);
    for(i = 0; i < 1000; i++) {
        kernel1 <<<blocks, threadsPerBlock>>> (img2, img1, width, height, img2_pitch, img1_pitch, 17);
    }
    cudaDeviceSynchronize();                   // wait for all launches before freeing
    cudaFree(img1);
    cudaFree(img2);
}
int main(void) {
    clock_t begin, end;
    begin = clock();
    blur(4096, 4096);
    end = clock();
    printf("Time: %f\n", (double)(end - begin) / CLOCKS_PER_SEC);
    return 0;
}
The code that uses pixel order is:
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

__global__
void kernel2(unsigned short *__restrict__ result, const unsigned short *__restrict__ img, int width, int height, size_t result_pitch, size_t img_pitch, int n) {
    int i, j, z, k, l, c, m;
    z = blockIdx.x;                            // color channel
    i = blockIdx.y * blockDim.y + threadIdx.y; // column
    j = blockIdx.z * blockDim.z + threadIdx.z; // row
    if(i < width && j < height) {
        c = 0;
        for(k = 0; k < (n >> 1); k++) {
            m = 0;
            l = i + k - n / 2;                 // k-th tap from the left
            if(0 <= l && l < width) {
                m = img[(j * img_pitch + l * 3) + z];
            }
            l = i + n - 1 - k - n / 2;         // matching tap from the right
            if(0 <= l && l < width) {
                m += img[(j * img_pitch + l * 3) + z];
            }
            c += m;
        }
        l = i + k - n / 2;                     // center tap (k == n / 2 here)
        if(0 <= l && l < width) {
            c += img[(j * img_pitch + l * 3) + z];
        }
        result[(j * result_pitch + i * 3) + z] = c / n;
    }
}
void blur(int width, int height) {
    int i;
    size_t img1_pitch, img2_pitch;
    unsigned short *__restrict__ img1, *__restrict__ img2;
    dim3 blocks(3, (width + 31) / 32, (height + 31) / 32);
    dim3 threadsPerBlock(1, 32, 32);
    cudaMallocPitch((void **)&img1, &img1_pitch, sizeof(unsigned short) * width * 3, height);
    img1_pitch /= sizeof(unsigned short);      // pitch in elements, not bytes
    cudaMallocPitch((void **)&img2, &img2_pitch, sizeof(unsigned short) * width * 3, height);
    img2_pitch /= sizeof(unsigned short);
    for(i = 0; i < 1000; i++) {
        kernel2 <<<blocks, threadsPerBlock>>> (img2, img1, width, height, img2_pitch, img1_pitch, 17);
    }
    cudaDeviceSynchronize();                   // wait for all launches before freeing
    cudaFree(img1);
    cudaFree(img2);
}
int main(void) {
    clock_t begin, end;
    begin = clock();
    blur(4096, 4096);
    end = clock();
    printf("Time: %f\n", (double)(end - begin) / CLOCKS_PER_SEC);
    return 0;
}
The kernel is launched 1000 times so that the measured time is more meaningful.
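In case the timing method matters: the same loop could also be timed with CUDA events instead of clock(). A minimal sketch of that variant, replacing the loop inside blur() (this is not the code I actually ran):

// Timing the launch loop with CUDA events; cudaEventSynchronize ensures
// all 1000 launches have finished before the elapsed time is read.
cudaEvent_t start, stop;
float ms;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for(i = 0; i < 1000; i++) {
    kernel1 <<<blocks, threadsPerBlock>>> (img2, img1, width, height, img2_pitch, img1_pitch, 17);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
printf("Kernel time: %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);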
Does anyone have a clue as to why the version of the code that is worse at memory coalescing actually runs faster?