So I’m trying to make a simple convolution with an RGB image. Convolution works fine as long as I’m not using shared memory. But with shared memory isn’t working that well. I’m getting just a weird texture.
Long short story. BLOCK_SIZE = 16. SHARED_DIM = 4, so we basically have a 4x4 block.
Can someone look at my code and tell me what’s wrong?
__global__ void GaussianBlur(byte *B, byte *G, byte *R, int rows, int cols, byte *new_B, byte *new_G, byte *new_R) {
int ty = blockIdx.y * blockDim.y + threadIdx.y;
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int kernal_dim = 3;
double kernal[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
int kernel_offset = kernal_dim / 2;
int image_row = ty;
int image_col = tx;
__shared__ char shared_memR[BLOCK_SIZE];
__shared__ char shared_memG[BLOCK_SIZE];
__shared__ char shared_memB[BLOCK_SIZE];
shared_memR[threadIdx.y * (SHARED_DIM)+threadIdx.x] = R[ty + cols + tx];
shared_memG[threadIdx.y * (SHARED_DIM)+threadIdx.x] = G[ty + cols + tx];
shared_memB[threadIdx.y * (SHARED_DIM)+threadIdx.x] = B[ty + cols + tx];
__syncthreads();
// Ignore border pixels
if (image_row >= kernel_offset && image_row < rows - kernel_offset && image_col >= kernel_offset && image_col < cols - kernel_offset) {
int valueR = 0;
int valueG = 0;
int valueB = 0;
for (int i = 0; i<kernal_dim; ++i) {
int row = (image_row - kernel_offset) + i;
for (int j = 0; j<kernal_dim; ++j) {
int col = (image_col - kernel_offset) + j;
valueR += (kernal[i * kernal_dim + j] * shared_memR[threadIdx.y * (SHARED_DIM) + threadIdx.x])/16;
valueG += (kernal[i * kernal_dim + j] * shared_memG[threadIdx.y * (SHARED_DIM) + threadIdx.x])/16;
valueB += (kernal[i * kernal_dim + j] * shared_memB[threadIdx.y * (SHARED_DIM) + threadIdx.x])/16;
}
}
new_B[image_row * cols + image_col] = (byte)valueB;
new_G[image_row * cols + image_col] = (byte)valueG;
new_R[image_row * cols + image_col] = (byte)valueR;
}
}
check that indexes are within BLOCKSIZE
So it should look something like this:
{
int ty = blockIdx.y * blockDim.y + threadIdx.y;
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int kernal_dim = 3;
double kernal[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
int kernel_offset = kernal_dim / 2;
int image_row = ty;
int image_col = tx;
int shared_x_offset = tx - kernel_offset;
int shared_y_offset = ty - kernel_offset;
__shared__ char shared_memR[BLOCK_SIZE];
__shared__ char shared_memG[BLOCK_SIZE];
__shared__ char shared_memB[BLOCK_SIZE];
shared_memR[threadIdx.y * (SHARED_DIM)+threadIdx.x] = R[ty + cols + tx];
shared_memG[threadIdx.y * (SHARED_DIM)+threadIdx.x] = G[ty + cols + tx];
shared_memB[threadIdx.y * (SHARED_DIM)+threadIdx.x] = B[ty + cols + tx];
__syncthreads();
// Ignore border pixels
if (image_row >= kernel_offset && image_row < rows - kernel_offset && image_col >= kernel_offset && image_col < cols - kernel_offset) {
int valueR = 0;
int valueG = 0;
int valueB = 0;
for (int i = 0; i<kernal_dim; ++i) {
int row = (image_row - kernel_offset) + i;
int shared_y_pos = shared_y_offset + i;
for (int j = 0; j < kernal_dim; ++j) {
int col = (image_col - kernel_offset) + j;
int shared_x_pos = shared_x_offset + j;
if (shared_x_pos >= 0 && shared_x_pos < SHARED_DIM && shared_y_pos >= 0 && shared_y_pos < SHARED_DIM) {
valueR += (kernal[i * kernal_dim + j] * shared_memR[shared_y_pos * (SHARED_DIM) + shared_x_pos]) / 16;
valueG += (kernal[i * kernal_dim + j] * shared_memG[shared_y_pos * (SHARED_DIM) + shared_x_pos]) / 16;
valueB += (kernal[i * kernal_dim + j] * shared_memB[shared_y_pos * (SHARED_DIM) + shared_x_pos]) / 16;
}
else {
valueR += (kernal[i * kernal_dim + j] * R[row * cols + col])/16;
valueG += (kernal[i * kernal_dim + j] * G[row * cols + col])/16;
valueB += (kernal[i * kernal_dim + j] * B[row * cols + col])/16;
}
}
}
new_B[image_row * cols + image_col] = (byte)valueB;
new_G[image_row * cols + image_col] = (byte)valueG;
new_R[image_row * cols + image_col] = (byte)valueR;
}
}
Cause if it does, I got no speed-up.
i mean that you need to debug your code by checking indexes in debug version and figure out yourself why it goes wrong
I have issues when debugging because of missing .dll files.
If I hadn’t I would’ve done it myself until now.
tera
June 21, 2017, 5:21pm
6
cuda-memcheck is a great tool.
njuffa
June 21, 2017, 5:33pm
7
What tera says. In addition: A few strategic printf() calls is often all that it takes to find an index-out-of-bounds issue, you don’t need a debugger.