Static shared memory Tiling (performance optimization)

Good evening,

I am working on accelerating some CUDA DEVICE kernel first_kernel in the following code using static shared memory and am having some issues in that the results in the new optimized_kernel do not match the results from the first_kernel results. For reference I am using CUDA Toolkit v12.04 with compute capability 8.0. I have configured the kernel memory to execute both of these kernels as the following:

const int block_width = 512;
dim3 blockSize(block_width, 1, 1);
dim3 gridSize(1, block_width, 1);

The first_kernel and optimized_kernel CUDA code follows. I suspect that the issue is that somehow the optimized_kernel is counting particular indices during the accumulation stage of the code, but I can’t seem to figure it out. Any help or hints would be greatly appreciated.

Many thanks.

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <stdexcept>
#include <chrono>
#include <cstdlib>
#include <vector>
#include <algorithm>

void checkCudaError(cudaError_t result, const char *file, int line, bool abort = true) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(result) << " at " << file << ":"
                  << line << std::endl;
        if (abort) {
            exit(result);
        }
    }
}

// Helper macro to capture file and line information automatically
#define GPU_ERROR_CHECK(result) checkCudaError(result, __FILE__, __LINE__)

__forceinline__ __host__ __device__ double dot(double3 a, double3 b) {
	return a.x * b.x + a.y * b.y + a.z * b.z;
}

__forceinline__ __host__ __device__ double length(double3 v) {
	return sqrt(dot(v, v));
}

__forceinline__ __host__ __device__ double2 myKernel(double val) {
	return { val, val / 2 };
}
__global__ void first_kernel(const double3 * __restrict__ pos_a, const double3 * __restrict__ pos_b, 
                           const int input_size, const int output_size, double2 * __restrict__ result,
                           const int region, const int first_idx_x, const int last_idx_x) {
    // Compute indices
    int thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    int thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    int stride_x = blockDim.x * gridDim.x;
    int stride_y = blockDim.y * gridDim.y;

    double3 distance3 = { 0.0, 0.0, 0.0 };
    double distance = 0.0;

    int output_start, output_end;

    for(int x = thridx_x; x < last_idx_x; x += stride_x){
        // Distance calculations
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;

        distance = length(distance3);
        output_start = __ddiv_rz(output_size, 2) + __ddiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        
        for(int y = thridx_y; y < output_size; y += stride_y){
            if((y < output_end) && (y >= output_start)) {
                double2 lval = myKernel(1.0);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }                        
        }
    }
}

__global__ void optimized_kernel(const double3* __restrict__ pos_a, const double3* __restrict__ pos_b,
                                        const int input_size, const int output_size, double2* __restrict__ result,
                                        const int region, const int first_idx_x, const int last_idx_x) {
    const int TILE_SIZE = 512; // Define the size of the tile
    __shared__ double3 shared_pos_a[TILE_SIZE];
    __shared__ double3 shared_pos_b[TILE_SIZE];

    // Calculate global index for the x-dimension
    int global_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;

    // Process tiles in the range defined by first_idx_x and last_idx_x
    for (int tile_x = global_x; tile_x < last_idx_x; tile_x += TILE_SIZE) {
        // Load the tile into shared memory
        if (tile_x + threadIdx.x < input_size) {
            shared_pos_a[threadIdx.x] = pos_a[tile_x + threadIdx.x];
            shared_pos_b[threadIdx.x] = pos_b[tile_x + threadIdx.x];
        }
        __syncthreads();

        // Each thread computes its contribution
        for (int i = 0; i < TILE_SIZE && (tile_x + i) < input_size; i++) {
            double3 distance3 = {
                shared_pos_a[threadIdx.x].x - shared_pos_b[i].x,
                shared_pos_a[threadIdx.x].y - shared_pos_b[i].y,
                shared_pos_a[threadIdx.x].z - shared_pos_b[i].z
            };
            double distance = length(distance3);

            // Calculate output range based on distance
            int output_start = max(0, static_cast<int>(__ddiv_rz(output_size, 2) + __ddiv_rz(distance, output_size) - region));
            int output_end = min(output_size, output_start + region);

            // Update results for the valid output range
            for (int y = output_start; y < output_end; y++) {
                if (y < output_size) { // Prevent going out of bounds
                    double2 lval = myKernel(1.0); // Replace with your kernel logic

                    // Ensure only one thread updates each result index
                    if (threadIdx.x == 0) {
                        atomicAdd(&result[y].x, lval.x);
                        atomicAdd(&result[y].y, lval.y);
                    }
                }
            }
            __syncthreads(); // Synchronize threads before the next tile processing
        }
    }
}