Good evening,
I am working on accelerating a CUDA device kernel, first_kernel, in the code below using static shared memory, and I am running into an issue: the results from the new optimized_kernel do not match the results from first_kernel (a sketch of how I compare the two result buffers on the host is at the end of this post). For reference, I am using CUDA Toolkit v12.04 with compute capability 8.0. I launch both kernels with the following execution configuration:
const int block_width = 512;
dim3 blockSize(block_width, 1, 1);
dim3 gridSize(1, block_width, 1);
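Both kernels are then launched the same way against a zero-initialized result buffer. The snippet below is only an illustrative host-side sketch: the device pointer names (d_pos_a, d_pos_b, d_result) are placeholders, and input_size, output_size, region, first_idx_x and last_idx_x are set elsewhere in my host code.
// Illustrative launch (placeholder names; GPU_ERROR_CHECK is defined in the listing below)
GPU_ERROR_CHECK(cudaMemset(d_result, 0, output_size * sizeof(double2)));
first_kernel<<<gridSize, blockSize>>>(d_pos_a, d_pos_b, input_size, output_size,
                                      d_result, region, first_idx_x, last_idx_x);
GPU_ERROR_CHECK(cudaGetLastError());
GPU_ERROR_CHECK(cudaDeviceSynchronize());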
The first_kernel and optimized_kernel CUDA code follows. I suspect that optimized_kernel is somehow mis-counting particular indices during the accumulation stage, but I can't figure out where. Any help or hints would be greatly appreciated.
Many thanks.
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <stdexcept>
#include <chrono>
#include <cstdlib>
#include <vector>
#include <algorithm>
void checkCudaError(cudaError_t result, const char *file, int line, bool abort = true) {
if (result != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(result) << " at " << file << ":"
<< line << std::endl;
if (abort) {
exit(result);
}
}
}
// Helper macro to capture file and line information automatically
#define GPU_ERROR_CHECK(result) checkCudaError(result, __FILE__, __LINE__)
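// Example usage (illustrative only): wrap each runtime API call, e.g.
//   GPU_ERROR_CHECK(cudaMalloc(&d_result, output_size * sizeof(double2)));
//   GPU_ERROR_CHECK(cudaDeviceSynchronize());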
__forceinline__ __host__ __device__ double dot(double3 a, double3 b) {
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__forceinline__ __host__ __device__ double length(double3 v) {
return sqrt(dot(v, v));
}
// Placeholder payload for the accumulation: returns {val, val / 2}
__forceinline__ __host__ __device__ double2 myKernel(double val) {
return { val, val / 2 };
}
__global__ void first_kernel(const double3 * __restrict__ pos_a, const double3 * __restrict__ pos_b,
const int input_size, const int output_size, double2 * __restrict__ result,
const int region, const int first_idx_x, const int last_idx_x) {
// Compute indices
int thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
int thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
int stride_x = blockDim.x * gridDim.x;
int stride_y = blockDim.y * gridDim.y;
double3 distance3 = { 0.0, 0.0, 0.0 };
double distance = 0.0;
int output_start, output_end;
for(int x = thridx_x; x < last_idx_x; x += stride_x){
// Distance calculations
distance3.x = pos_a[x].x - pos_b[x].x;
distance3.y = pos_a[x].y - pos_b[x].y;
distance3.z = pos_a[x].z - pos_b[x].z;
distance = length(distance3);
// Window of output bins this element contributes to (round-toward-zero divisions)
output_start = __ddiv_rz(output_size, 2) + __ddiv_rz(distance, output_size) - region;
output_end = output_start + region;
for(int y = thridx_y; y < output_size; y += stride_y){
if((y < output_end) && (y >= output_start)) {
double2 lval = myKernel(1.0);
atomicAdd(&result[y].x, lval.x);
atomicAdd(&result[y].y, lval.y);
}
}
}
}
__global__ void optimized_kernel(const double3* __restrict__ pos_a, const double3* __restrict__ pos_b,
const int input_size, const int output_size, double2* __restrict__ result,
const int region, const int first_idx_x, const int last_idx_x) {
const int TILE_SIZE = 512; // Define the size of the tile
__shared__ double3 shared_pos_a[TILE_SIZE];
__shared__ double3 shared_pos_b[TILE_SIZE];
// Calculate global index for the x-dimension
int global_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
// Process tiles in the range defined by first_idx_x and last_idx_x
for (int tile_x = global_x; tile_x < last_idx_x; tile_x += TILE_SIZE) {
// Load the tile into shared memory
if (tile_x + threadIdx.x < input_size) {
shared_pos_a[threadIdx.x] = pos_a[tile_x + threadIdx.x];
shared_pos_b[threadIdx.x] = pos_b[tile_x + threadIdx.x];
}
__syncthreads();
// Each thread computes its contribution
for (int i = 0; i < TILE_SIZE && (tile_x + i) < input_size; i++) {
double3 distance3 = {
shared_pos_a[threadIdx.x].x - shared_pos_b[i].x,
shared_pos_a[threadIdx.x].y - shared_pos_b[i].y,
shared_pos_a[threadIdx.x].z - shared_pos_b[i].z
};
double distance = length(distance3);
// Calculate output range based on distance
int output_start = max(0, static_cast<int>(__ddiv_rz(output_size, 2) + __ddiv_rz(distance, output_size) - region));
int output_end = min(output_size, output_start + region);
// Update results for the valid output range
for (int y = output_start; y < output_end; y++) {
if (y < output_size) { // Prevent going out of bounds
double2 lval = myKernel(1.0); // Same per-hit contribution as in first_kernel
// Ensure only one thread updates each result index
if (threadIdx.x == 0) {
atomicAdd(&result[y].x, lval.x);
atomicAdd(&result[y].y, lval.y);
}
}
}
__syncthreads(); // Synchronize threads before the next tile processing
}
}
}
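For reference, this is roughly how I detect the mismatch on the host after running each kernel against its own zeroed copy of result. It is only a sketch: the buffer names d_result_first and d_result_opt are placeholders, and it assumes <cmath> is included for std::fabs.
std::vector<double2> h_first(output_size), h_opt(output_size);
GPU_ERROR_CHECK(cudaMemcpy(h_first.data(), d_result_first,
                           output_size * sizeof(double2), cudaMemcpyDeviceToHost));
GPU_ERROR_CHECK(cudaMemcpy(h_opt.data(), d_result_opt,
                           output_size * sizeof(double2), cudaMemcpyDeviceToHost));
int mismatches = 0;
for (int i = 0; i < output_size; ++i) {
    if (std::fabs(h_first[i].x - h_opt[i].x) > 1e-9 ||
        std::fabs(h_first[i].y - h_opt[i].y) > 1e-9) {
        ++mismatches;
    }
}
std::cout << "Mismatching output bins: " << mismatches << " / " << output_size << std::endl;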