Convolution Texture with Shared Memory

I’m looking at the CUDA example for convolution with texture and am wondering how I can implement shared memory for this example.

So far I’ve updated the launch of convolutionColumnsKernel to pass a dynamic shared memory size.

extern "C" void convolutionColumnsGPU(float* d_Dst, cudaArray * a_Src,
	int imageW, int imageH,
	cudaTextureObject_t texSrc) {
	dim3 threads(16, 12);
	dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y));

	int shared_memory_size = (threads.x + 2 * KERNEL_RADIUS) * threads.y * sizeof(float);

	convolutionColumnsKernel<<<blocks, threads, shared_memory_size>>>(d_Dst, imageW, imageH, texSrc);
	getLastCudaError("convolutionColumnsKernel() execution failed\n");
}

My attempt to use shared memory looks like this:

__global__ void convolutionColumnsKernel(float* d_Dst, int imageW, int imageH, cudaTextureObject_t texSrc) {
	const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
	const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
	const float x = (float)ix + 0.5f;
	const float y = (float)iy + 0.5f;

	if (ix >= imageW || iy >= imageH) {
		return;
	}

	// Allocate shared memory dynamically
	extern __shared__ float shared_Data_Column[];

	int col = threadIdx.x + KERNEL_RADIUS;

	if (col < imageW) {
		shared_Data_Column[col] = tex2D<float>(texSrc, x, y);
	}

	__syncthreads();

	float sum = 0;

#if (UNROLL_INNER) // This is set to 0
	sum = convolutionColumn<2 * KERNEL_RADIUS>(x, y, texSrc);
#else

	for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
		int shared_col = threadIdx.x + k + KERNEL_RADIUS;
		sum += shared_Data_Column[shared_col] * c_Kernel[KERNEL_RADIUS - k];
	}

#endif

	d_Dst[IMAD(iy, imageW, ix)] = sum;
}

However, this does not match the CPU implementation, so something is wrong, and I think the problem is in how I’m doing the final summation:

	for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
		int shared_col = threadIdx.x + k + KERNEL_RADIUS;
		sum += shared_Data_Column[shared_col] * c_Kernel[KERNEL_RADIUS - k];
	}

Can anybody spot the issue for me?

One problem is that you are only indexing enough shared memory for a single row, but your threadblock has multiple rows (threads in y) in it.

You allocate enough shared storage for the whole threadblock: the shared_memory_size calculation in your host code accounts for all threads.y rows.

But your store and usage, shared_Data_Column[col] = tex2D<float>(texSrc, x, y); with col = threadIdx.x + KERNEL_RADIUS, treat every row as if it were the same row.

Considering just that line, you have a threadblock of 16x12 threads: 16 threads in x, 12 in y. When the threads whose threadIdx.y value is zero load their part of shared memory, where will the threads whose threadIdx.y is one load their data? They will load it into exactly the same spots as the locations used by the first row.

You need to refactor your code so that shared memory usage covers the entire block. The straightforward way to do this is to treat the shared memory as 2D storage. That is easy with statically allocated shared memory; for dynamically allocated shared memory, we need to do the 2D indexing calculation ourselves. Something like this:

int col = threadIdx.x + KERNEL_RADIUS;
// give each row of the block its own (blockDim.x + 2*KERNEL_RADIUS)-wide slice
int sidx = threadIdx.y * (blockDim.x + 2 * KERNEL_RADIUS) + col;
if (col < imageW) {
	shared_Data_Column[sidx] = tex2D<float>(texSrc, x, y);
}

And you’ll need to similarly refactor your usage later to account for 2D access.
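For the computation loop, that refactor might look something like this (just a sketch reusing the sidx layout above, with my own variable naming; I haven’t compiled it against your code):

	float sum = 0;

	for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
		// stay within this thread's own row of the shared tile
		int shared_col = threadIdx.x + k + KERNEL_RADIUS;
		int s = threadIdx.y * (blockDim.x + 2 * KERNEL_RADIUS) + shared_col;
		sum += shared_Data_Column[s] * c_Kernel[KERNEL_RADIUS - k];
	}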

You’ll also need a loop or some additional work to load shared memory. You have a thread array of 16x12 threads per block, and the line of code I offered:

if (col < imageW) {
	shared_Data_Column[sidx] = tex2D<float>(texSrc, x, y);
}

will load one element per thread. So if you only needed to load 16x12 elements, that would be enough. But because of the convolution apron, you have storage for not 16x12 but something like 20x12 elements. 16x12 threads will not be enough to load 20x12 elements, if each thread only loads one element. So you will need some sort of loop, or additional load operations, to load the “apron” elements. You might be able to borrow some ideas from your computation loop.
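As a rough sketch (untested; tile_w and fetch_x are names I’m introducing here), each thread could stride across the padded tile width so the apron columns get filled too, relying on the texture’s addressing mode to handle the out-of-range fetches near the image borders:

	int tile_w = blockDim.x + 2 * KERNEL_RADIUS;

	for (int lx = threadIdx.x; lx < tile_w; lx += blockDim.x) {
		// image x coordinate of this tile column, shifted left by the apron
		float fetch_x = (float)(blockIdx.x * blockDim.x + lx - KERNEL_RADIUS) + 0.5f;
		shared_Data_Column[threadIdx.y * tile_w + lx] = tex2D<float>(texSrc, fetch_x, y);
	}

	__syncthreads();

With a 16-wide block, each thread then makes one or two trips through that loop, depending on KERNEL_RADIUS.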

Thank you for responding, it’s really appreciated. I was able to get this to match the CPU implementation, but after profiling it looks like some of the memory accesses are not coalesced, specifically in my row convolution implementation.

__global__ void convolutionRowsKernel(float* d_Dst, int imageW, int imageH,
    cudaTextureObject_t texSrc) {
    const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
    const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
    const float x = (float)ix + 0.5f;
    const float y = (float)iy + 0.5f;

    if (ix >= imageW || iy >= imageH) {
        return;
    }

    // Shared memory
    extern __shared__ float sharedData[];

    // Load data from source in the bounds of the kernel into shared memory
    for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
        int offset = (threadIdx.x + k + KERNEL_RADIUS) * blockDim.x + threadIdx.y;
        sharedData[offset] = tex2D<float>(texSrc, x + (float)k, y);
    }

    __syncthreads();

    float sum = 0;

#if (UNROLL_INNER) // ignoring this in favor of the shared memory approach for now
    sum = convolutionRow<2 * KERNEL_RADIUS>(x, y, texSrc);
#else

    for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
        int d = ix + k;

        if (d < 0) {
            d = 0;
        }
        else if (d >= imageW) {
            d = imageW - 1;
        }

        // Boundary conditions
        if (d >= 0 && d < imageW) {
            int offset = (threadIdx.x + k + KERNEL_RADIUS) * blockDim.x + threadIdx.y;

            sum += sharedData[offset] * c_Kernel[KERNEL_RADIUS - k];
        }
    }

#endif

    d_Dst[IMAD(iy, imageW, ix)] = sum;
}

extern "C" void convolutionRowsGPU(float* d_Dst, cudaArray * a_Src, int imageW,
    int imageH, cudaTextureObject_t texSrc) {
    dim3 threads(16, 12);
    dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y));

    size_t sharedDataSize = imageW * sizeof(float);

    convolutionRowsKernel<<<blocks, threads, sharedDataSize>>>(d_Dst, imageW, imageH, texSrc);
    getLastCudaError("convolutionRowsKernel() execution failed\n");
}

The offsets I calculate here

for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
    int offset = (threadIdx.x + k + KERNEL_RADIUS) * blockDim.x + threadIdx.y;
    sharedData[offset] = tex2D<float>(texSrc, x + (float)k, y);
}

are not sequential; the values follow the pattern 0, 16, 32, … since we’re scaling by the blockDim.x value.

What should these offset values be for coalesced memory access? If I use the same offset scheme as the column convolution, the offsets become sequential (0, 1, 2, …), but then the final output no longer matches the CPU implementation, so I’m a bit confused about how to make the memory access coalesced and still get correct results.

Column convolution as reference:

__global__ void convolutionColumnsKernel(float* d_Dst, int imageW, int imageH,
    cudaTextureObject_t texSrc) {
    const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
    const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
    const float x = (float)ix + 0.5f;
    const float y = (float)iy + 0.5f;

    if (ix >= imageW || iy >= imageH) {
        return;
    }

    // Shared memory
    extern __shared__ float sharedData[];

    // Load data from source in the bounds of the kernel into shared memory
    for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
        int offset = (threadIdx.y + k + KERNEL_RADIUS) * blockDim.x + threadIdx.x;
        sharedData[offset] = tex2D<float>(texSrc, x, y + (float)k);
    }

    __syncthreads();

    float sum = 0;

#if (UNROLL_INNER)
    sum = convolutionColumn<2 * KERNEL_RADIUS>(x, y, texSrc);
#else

    for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
        int d = iy + k;

        if (d < 0) {
            d = 0;
        } else if (d >= imageH) {
            d = imageH - 1;
        }

        // Boundary conditions
        if (d >= 0 && d < imageH) {
            int offset = (threadIdx.y + k + KERNEL_RADIUS) * blockDim.x + threadIdx.x;

            sum += sharedData[offset] * c_Kernel[KERNEL_RADIUS - k];
        }
    }

#endif

    d_Dst[IMAD(iy, imageW, ix)] = sum;
}

extern "C" void convolutionColumnsGPU(float* d_Dst, cudaArray * a_Src,
    int imageW, int imageH,
    cudaTextureObject_t texSrc) {
    dim3 threads(16, 12);
    dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y));

    size_t sharedDataSize = imageH * sizeof(float);

    convolutionColumnsKernel<<<blocks, threads, sharedDataSize>>>(d_Dst, imageW, imageH, texSrc);
    getLastCudaError("convolutionColumnsKernel() execution failed\n");
}

If you want accesses to coalesce, then a general principle is that you should not multiply threadIdx.x by anything when creating the index. You are violating this principle. You are effectively doing columnar access, which is canonically bad on a CUDA GPU.

You’ll note that the offset you created for your column convolution kernel satisfies this principle.
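To spell that out with your 16-wide block (my own worked illustration, looking at the first tap of the loop):

// row kernel:    offset = (threadIdx.x + ...) * blockDim.x + threadIdx.y
//   threads (0,0), (1,0), (2,0), ... -> offsets 0, 16, 32, ...  (stride of 16)
// column kernel: offset = (threadIdx.y + ...) * blockDim.x + threadIdx.x
//   threads (0,0), (1,0), (2,0), ... -> offsets 0, 1, 2, ...    (consecutive)

Adjacent threads in x land on adjacent locations only in the second case.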

It may be possible to refactor your code (e.g. using shared memory) to work around columnar access, but I know of no simple “arithmetic” that converts columnar access to row access while still preserving the intent of the code.
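That said, one possible direction (a sketch only, untested, and not a drop-in replacement for your kernel) is to give the row kernel the same row-major tile layout your column kernel already uses, so threadIdx.x is only ever added, never multiplied, in both the load and the sum; tile_w is my own name for the padded tile width, and the out-of-range columns are left to the texture addressing mode:

    int tile_w = blockDim.x + 2 * KERNEL_RADIUS;   // padded tile width
    extern __shared__ float sharedData[];          // needs tile_w * blockDim.y floats at launch

    // cooperative load: consecutive threads in x write consecutive shared addresses
    for (int lx = threadIdx.x; lx < tile_w; lx += blockDim.x) {
        float fetch_x = (float)(blockIdx.x * blockDim.x + lx - KERNEL_RADIUS) + 0.5f;
        sharedData[threadIdx.y * tile_w + lx] = tex2D<float>(texSrc, fetch_x, y);
    }

    __syncthreads();

    float sum = 0;

    for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
        // consecutive threads in x also read consecutive shared addresses here
        sum += sharedData[threadIdx.y * tile_w + threadIdx.x + KERNEL_RADIUS + k] * c_Kernel[KERNEL_RADIUS - k];
    }

    d_Dst[IMAD(iy, imageW, ix)] = sum;

The dynamic shared memory size at launch would then be (threads.x + 2 * KERNEL_RADIUS) * threads.y * sizeof(float) rather than imageW * sizeof(float).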