Shared memory out of bounds (simple convolution)

So I’m trying to make a simple convolution with an RGB image. Convolution works fine as long as I’m not using shared memory. But with shared memory isn’t working that well. I’m getting just a weird texture.

Long short story. BLOCK_SIZE = 16. SHARED_DIM = 4, so we basically have a 4x4 block.

Can someone look at my code and tell me what’s wrong?

__global__ void GaussianBlur(byte *B, byte *G, byte *R, int rows, int cols, byte *new_B, byte *new_G, byte *new_R) {

	int ty = blockIdx.y * blockDim.y + threadIdx.y;
	int tx = blockIdx.x * blockDim.x + threadIdx.x;

	int kernal_dim = 3;

	double kernal[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };

	int kernel_offset = kernal_dim / 2;

	int image_row = ty;
	int image_col = tx;

		__shared__ char shared_memR[BLOCK_SIZE];
		__shared__ char shared_memG[BLOCK_SIZE];
		__shared__ char shared_memB[BLOCK_SIZE];

		shared_memR[threadIdx.y * (SHARED_DIM)+threadIdx.x] = R[ty + cols + tx];
		shared_memG[threadIdx.y * (SHARED_DIM)+threadIdx.x] = G[ty + cols + tx];
		shared_memB[threadIdx.y * (SHARED_DIM)+threadIdx.x] = B[ty + cols + tx];

		__syncthreads();

	// Ignore border pixels
	if (image_row >= kernel_offset && image_row < rows - kernel_offset && image_col >= kernel_offset && image_col < cols - kernel_offset) {

		int valueR = 0;
		int valueG = 0;
		int valueB = 0;

		for (int i = 0; i<kernal_dim; ++i) {
			int row = (image_row - kernel_offset) + i;

			for (int j = 0; j<kernal_dim; ++j) {
				int col = (image_col - kernel_offset) + j;
				valueR += (kernal[i * kernal_dim + j] * shared_memR[threadIdx.y * (SHARED_DIM) + threadIdx.x])/16;
				valueG += (kernal[i * kernal_dim + j] * shared_memG[threadIdx.y * (SHARED_DIM) + threadIdx.x])/16;
				valueB += (kernal[i * kernal_dim + j] * shared_memB[threadIdx.y * (SHARED_DIM) + threadIdx.x])/16;
			}
		}
		new_B[image_row * cols + image_col] = (byte)valueB;
		new_G[image_row * cols + image_col] = (byte)valueG;
		new_R[image_row * cols + image_col] = (byte)valueR;
	}
}

check that indexes are within BLOCKSIZE

So it should look something like this:

{
int ty = blockIdx.y * blockDim.y + threadIdx.y;
	int tx = blockIdx.x * blockDim.x + threadIdx.x;

	int kernal_dim = 3;

	double kernal[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };

	int kernel_offset = kernal_dim / 2;

	int image_row = ty;
	int image_col = tx;

	int shared_x_offset = tx - kernel_offset;
	int shared_y_offset = ty - kernel_offset;

		__shared__ char shared_memR[BLOCK_SIZE];
		__shared__ char shared_memG[BLOCK_SIZE];
		__shared__ char shared_memB[BLOCK_SIZE];

		shared_memR[threadIdx.y * (SHARED_DIM)+threadIdx.x] = R[ty + cols + tx];
		shared_memG[threadIdx.y * (SHARED_DIM)+threadIdx.x] = G[ty + cols + tx];
		shared_memB[threadIdx.y * (SHARED_DIM)+threadIdx.x] = B[ty + cols + tx];

		__syncthreads();

	// Ignore border pixels
	if (image_row >= kernel_offset && image_row < rows - kernel_offset && image_col >= kernel_offset && image_col < cols - kernel_offset) {

		int valueR = 0;
		int valueG = 0;
		int valueB = 0;

		for (int i = 0; i<kernal_dim; ++i) {
			int row = (image_row - kernel_offset) + i;
			int shared_y_pos = shared_y_offset + i;

			for (int j = 0; j < kernal_dim; ++j) {
				int col = (image_col - kernel_offset) + j;
				int shared_x_pos = shared_x_offset + j;
				if (shared_x_pos >= 0 && shared_x_pos < SHARED_DIM && shared_y_pos >= 0 && shared_y_pos < SHARED_DIM) {

					valueR += (kernal[i * kernal_dim + j] * shared_memR[shared_y_pos * (SHARED_DIM) + shared_x_pos]) / 16;
					valueG += (kernal[i * kernal_dim + j] * shared_memG[shared_y_pos * (SHARED_DIM) + shared_x_pos]) / 16;
					valueB += (kernal[i * kernal_dim + j] * shared_memB[shared_y_pos * (SHARED_DIM) + shared_x_pos]) / 16;
				}
				else {
						valueR += (kernal[i * kernal_dim + j] * R[row * cols + col])/16;
						valueG += (kernal[i * kernal_dim + j] * G[row * cols + col])/16;
						valueB += (kernal[i * kernal_dim + j] * B[row * cols + col])/16;
				}
			}
		}
		new_B[image_row * cols + image_col] = (byte)valueB;
		new_G[image_row * cols + image_col] = (byte)valueG;
		new_R[image_row * cols + image_col] = (byte)valueR;
	}
}

Cause if it does, I got no speed-up.

i mean that you need to debug your code by checking indexes in debug version and figure out yourself why it goes wrong

I have issues when debugging because of missing .dll files.

If I hadn’t I would’ve done it myself until now.

cuda-memcheck is a great tool.

What tera says. In addition: A few strategic printf() calls is often all that it takes to find an index-out-of-bounds issue, you don’t need a debugger.