I’m writing a CUDA program that computes a distance between two images. It uses 7*7 patches around each pixel, and the distance calculation is repeated multiple times (roughly 100 times per pixel).
I first wrote a version using only global memory, which means that each pixel does 7*7*100*2 = 9800 global memory accesses. Then I wrote another version that fills shared memory buffers from global memory (2 global memory accesses per pixel) and does the distance calculation from shared memory (9800 shared memory accesses).
At first, I only did the distance calculation once per pixel, and the shared memory version was slower; that didn’t surprise me much, since there is an overhead from storing the lines in shared memory and I was reading them only once, so it was a bit of a waste. Then I started doing the calculation multiple times per pixel, but the shared memory version was still slower.
Global memory version, code below (average kernel duration):
- 10 calculations per pixel: 1.78ms
- 100 calculations per pixel: 17.55ms
- 1000 calculations per pixel: 175.10ms
Shared memory version, code below (average kernel duration):
- 10 calculations per pixel: 2.64ms
- 100 calculations per pixel: 25.44ms
- 1000 calculations per pixel: 253.46ms
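(The durations above are per-launch averages over the 20 iterations of the loop in main() below. The timing harness itself isn’t part of the posted code; a minimal cudaEvent-based sketch of the kind of measurement I mean, not necessarily the exact harness I used, would look like this:)

    // Timing sketch only (not part of the posted program): per-launch duration with CUDA events.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    scan_ref<<<240, 320>>>(d_left, d_right, 320, 240, 3);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("scan_ref: %.3f ms\n", ms);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);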
And the thing that surprised me is that both versions get proportionally slower with each additional calculation, even though each one adds 7*7*2 = 98 global memory accesses per pixel to the first version and 98 shared memory accesses to the second. I was expecting the shared memory version to become more and more worthwhile.
Why is accessing shared memory as slow as accessing global memory here? Am I using it correctly?
I assume there are multiple shared memory bank conflicts, but I would have assumed the shared memory version should still be faster, and I don’t see how I could access the memory in a better way given the constraints of my algorithm (each pixel has to read its 7*7 neighborhood) anyway.
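To make the access pattern concrete, here is a small helper (illustration only, assuming the Kepler default of 32 banks with a 4-byte bank width on the TK1) that computes which bank a given neighborhood read from line_l / line_r lands in:

    // Illustration only: which shared memory bank a neighborhood read hits,
    // assuming 32 banks x 4-byte bank width (the Kepler default on the TK1).
    // Thread `tid` reading offset (i, j) touches byte index width*3 + tid + j*width + i
    // in line_l / line_r (see the scan_shr kernel below).
    static int smem_bank(int tid, int j, int i, int width /*320*/)
    {
        int byte_index = width * 3 + tid + j * width + i;
        return (byte_index / 4) % 32;   // 4 bytes per bank word, 32 banks
    }

Adjacent threads read adjacent bytes, so four consecutive threads land in the same 32-bit word of the same bank.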
Or could it be something related to compiler optimization? If so, would it be possible to make the shared memory version faster?
Something else, really strange, that might be related:
When I remove the two lines starting with “err += abs(” (one in each kernel), which do basically all the work and all the memory accesses, both kernels (global and shared) become 2-3ms slower (19ms and 28ms for 100 calculations per pixel instead of 17ms and 25ms)!
Why would removing memory accesses make the program slower?
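One way I can think of to rule the compiler in or out would be to actually consume err instead of leaving the validation write commented out. A sketch of what I mean (d_valid is a hypothetical 320*240 int output buffer, not present in the code as posted):

    // Sketch only: same loops as scan_ref below, but the result is stored,
    // so the compiler cannot remove the loads as dead code.
    // d_valid is a hypothetical 320*240 int buffer (not in the posted code).
    __global__ void scan_ref_store(unsigned char *d_left, unsigned char *d_right,
                                   unsigned short width, unsigned short height,
                                   unsigned short wradius, int *d_valid)
    {
        int id = blockIdx.x * blockDim.x + threadIdx.x;
        if (threadIdx.x < wradius || threadIdx.x > width - wradius - 1
            || blockIdx.x < wradius || blockIdx.x > height - wradius - 1)
            return;
        int err = 0;
        for (int r = 0; r < REPEAT; ++r)
            for (short j = -wradius; j < wradius + 1; ++j)
                for (short i = -wradius; i < wradius + 1; ++i)
                    err += abs(d_right[id + j*width + i] - d_left[id + j*width + i]);
        d_valid[id] = err;   // this store keeps the whole loop nest live
    }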
(I’m using a Tegra K1 / Jetson TK1, where the maximum global memory bandwidth is ~12.7 GB/s.)
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define REPEAT (100)
/* Perform an L1 distance on a 7*7 neighborhood on a 320*240 image
 *
 * 240 blocks of 320 threads each
 **/
__global__ void scan_ref(unsigned char *d_left, unsigned char *d_right,
                         unsigned short width /*320*/, unsigned short height /*240*/, unsigned short wradius /*3*/)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIdx.x < wradius || threadIdx.x > width - wradius - 1
        || blockIdx.x < wradius || blockIdx.x > height - wradius - 1)
        return;
    int err = 0;
    for (int r = 0; r < REPEAT; ++r) {
        // for each pixel, 7*7*REPEAT*2 = 9800 global memory accesses
        for (short j = -wradius; j < wradius + 1; ++j) {
            for (short i = -wradius; i < wradius + 1; ++i) {
                err += abs(d_right[id + (j*width)+i] - d_left[id + (j*width)+i]);
            }
        }
    }
    //validation[id] = err;
}
/* Perform an L1 distance on a 7*7 neighborhood on a 320*240 image, using shared memory buffers
 *
 * 4 blocks (or more, it doesn't change the current problem) of 320 threads each; the image's height is
 * automatically split to match the number of blocks (4 blocks = 60+6 lines per block).
 * Each block first loads the first 6 rows (if the window radius is 3) of its chunk,
 * then, for each line of the chunk, only one new line is loaded to shared memory
 * and the L1 distance is computed using the two (left and right) shared memory
 * buffers instead of direct global memory accesses.
 **/
__global__ void scan_shr(unsigned char *d_left, unsigned char *d_right,
                         unsigned short width /*320*/, unsigned short height /*240*/, unsigned short wradius /*3*/)
{
    int tid = threadIdx.x;
    int chunk = height / gridDim.x;
    int gid = blockIdx.x * blockDim.x + tid;
    __shared__ unsigned char line_l[320*7]; // store 7 lines from the left and right images in shared memory
    __shared__ unsigned char line_r[320*7];
    for (int i = 0; i < wradius * 2; ++i) {
        // load the first 6 lines of the block's chunk into shared rows 1-6
        line_l[tid + width*(i+1)] = d_left[chunk*blockIdx.x*width + tid + width*i];
        line_r[tid + width*(i+1)] = d_right[chunk*blockIdx.x*width + tid + width*i];
    }
    for (int i = chunk * blockIdx.x + wradius;
         i < chunk * (blockIdx.x + 1) - wradius + (blockIdx.x != gridDim.x - 1 ? wradius * 2 : 0); ++i) {
        for (int k = 0; k < wradius * 2; ++k) {
            // shift shared rows 1-6 down to rows 0-5
            line_l[(k*width) + tid] = line_l[width + (k*width) + tid];
            line_r[(k*width) + tid] = line_r[width + (k*width) + tid];
        }
        gid = i * width + tid;
        // and fill the last row (index 6) with new data
        line_l[tid + width*wradius*2] = d_left[gid + width*wradius];
        line_r[tid + width*wradius*2] = d_right[gid + width*wradius];
        __syncthreads();
        if (tid >= wradius && tid <= width - wradius - 1) {
            int err = 0;
            for (int r = 0; r < REPEAT; ++r) {
                // for each pixel: 2 global memory accesses, plus 7*7*REPEAT*2 = 9800 shared memory reads
                // (and 26 more shared memory accesses per line for the shift and reload above)
                for (short j = -wradius; j < wradius + 1; ++j) {
                    for (short i = -wradius; i < wradius + 1; ++i) {
                        err += abs(line_r[width*3 + tid + (j*width)+i] - line_l[width*3 + tid + (j*width)+i]);
                    }
                }
            }
            //validation[gid] = err;
        }
    }
}
int main(void) {
    const size_t size = sizeof(unsigned char) * 320 * 240;
    unsigned char *d_left, *d_right;
    unsigned char *h_left, *h_right;
    h_left = (unsigned char*)malloc(size);
    h_right = (unsigned char*)malloc(size);
    cudaMalloc(&d_left, size);
    cudaMalloc(&d_right, size);
    for (int k = 0; k < 20; ++k) {
        srand(101909);
        for (int i = 0; i < 320 * 240; ++i) {
            h_left[i] = (!(rand() % 5) ? rand() % 256 : 0);
            h_right[i] = (!(rand() % 5) ? rand() % 256 : 0);
        }
        cudaMemcpy(d_left, h_left, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_right, h_right, size, cudaMemcpyHostToDevice);
        scan_ref<<<240, 320>>>(d_left, d_right, 320, 240, 3);
        scan_shr<<<4, 320>>>(d_left, d_right, 320, 240, 3);
    }
    cudaFree(d_left);
    cudaFree(d_right);
    return (0);
}