Thank you for responding, it’s really appreciated - I was able to get this to match the CPU implementation, but after profiling it looks like some of the memory access is not coalesced, specifically on my row convolution implementation.

```
__global__ void convolutionRowsKernel(float* d_Dst, int imageW, int imageH,
cudaTextureObject_t texSrc) {
const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
const float x = (float)ix + 0.5f;
const float y = (float)iy + 0.5f;
if (ix >= imageW || iy >= imageH) {
return;
}
// Shared memory
extern __shared__ float sharedData[];
// Load data from source in the bounds of the kernel into shared memory
for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
int offset = (threadIdx.x + k + KERNEL_RADIUS) * blockDim.x + threadIdx.y;
sharedData[offset] = tex2D<float>(texSrc, x + (float)k, y);
}
__syncthreads();
float sum = 0;
#if (UNROLL_INNER) // ignoring this if for favor or shared memory approach for now
sum = convolutionRow<2 * KERNEL_RADIUS>(x, y, texSrc);
#else
for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
int d = ix + k;
if (d < 0) {
d = 0;
}
else if (d >= imageW) {
d = imageW - 1;
}
// Boundary conditions
if (d >= 0 && d < imageW) {
int offset = (threadIdx.x + k + KERNEL_RADIUS) * blockDim.x + threadIdx.y;
sum += sharedData[offset] * c_Kernel[KERNEL_RADIUS - k];
}
}
#endif
d_Dst[IMAD(iy, imageW, ix)] = sum;
}
extern "C" void convolutionRowsGPU(float* d_Dst, cudaArray * a_Src, int imageW,
int imageH, cudaTextureObject_t texSrc) {
dim3 threads(16, 12);
dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y));
float sharedDataSize = imageW * sizeof(float);
convolutionRowsKernel << <blocks, threads, sharedDataSize >> > (d_Dst, imageW, imageH, texSrc);
getLastCudaError("convolutionRowsKernel() execution failed\n");
}
```

My calculations for the offset

```
for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
int offset = (threadIdx.x + k + KERNEL_RADIUS) * blockDim.x + threadIdx.y;
sharedData[offset] = tex2D<float>(texSrc, x + (float)k, y);
}
```

to use are not sequential with values following the pattern 0, 16, 32 since we’re going off the `blockDim.x`

value.

What should these offset values be for coalesced memory access? If I use the same offset as the column convolution, it’s a sequence number from 0, 1, 2, … however the final output does not match the CPU implementation anymore so I’m a bit confused on how to make memory access coalesced and also work properly?

Column convolution as reference:

```
__global__ void convolutionColumnsKernel(float* d_Dst, int imageW, int imageH,
cudaTextureObject_t texSrc) {
const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
const float x = (float)ix + 0.5f;
const float y = (float)iy + 0.5f;
if (ix >= imageW || iy >= imageH) {
return;
}
// Shared memory
extern __shared__ float sharedData[];
// Load data from source in the bounds of the kernel into shared memory
for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
int offset = (threadIdx.y + k + KERNEL_RADIUS) * blockDim.x + threadIdx.x;
sharedData[offset] = tex2D<float>(texSrc, x, y + (float)k);
}
__syncthreads();
float sum = 0;
#if (UNROLL_INNER)
sum = convolutionColumn<2 * KERNEL_RADIUS>(x, y, texSrc);
#else
for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
int d = iy + k;
if (d < 0) {
d = 0;
} else if (d >= imageH) {
d = imageH - 1;
}
// Boundary conditions
if (d >=0 && d < imageH) {
int offset = (threadIdx.y + k + KERNEL_RADIUS) * blockDim.x + threadIdx.x;
sum += sharedData[offset] * c_Kernel[KERNEL_RADIUS - k];
}
}
#endif
d_Dst[IMAD(iy, imageW, ix)] = sum;
}
extern "C" void convolutionColumnsGPU(float* d_Dst, cudaArray * a_Src,
int imageW, int imageH,
cudaTextureObject_t texSrc) {
dim3 threads(16, 12);
dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y));
float sharedDataSize = imageH * sizeof(float);
convolutionColumnsKernel << <blocks, threads, sharedDataSize >> > (d_Dst, imageW, imageH, texSrc);
getLastCudaError("convolutionColumnsKernel() execution failed\n");
}
```