Okay, I think I got the code to work using shared memory for increasing array input sizes. The performance boost is noticeable for smaller array inputs, e.g., with n_values of 8092 I get approximately a 1.25x speedup. However, for larger arrays the performance is either the same or worse.
Does anyone have any suggestions for boosting the execution speed of mod_kernel? I have even been toying with the idea of giving CUB a try.
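By "giving CUB a try" I mean something like the sketch below (untested; run_cub_histogram and d_start_bins are placeholder names of mine, only cub::DeviceHistogram::HistogramEven is actual CUB API). Since every element adds (1, 0.5) to `region` consecutive bins starting at its output_start, each result[y].x is just a windowed sum of a histogram over the output_start values, and result[y].y is half of that, so the global atomics could perhaps be replaced by a histogram plus a cheap post-pass:

#include <cub/cub.cuh>
// Step 1 (separate kernel, not shown): for every element i compute its
// output_start exactly as in mod_kernel and store it as a float in
// d_start_bins[i].
// Step 2: count how many elements start in each bin.
cudaError_t run_cub_histogram(const float *d_start_bins, uint32_t *d_hist,
                              int n_values, int output_size)
{
    void  *d_temp = nullptr;
    size_t temp_bytes = 0;
    // First call only computes the required temporary storage size
    cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes, d_start_bins,
                                        d_hist, output_size + 1,
                                        0.0f, (float)output_size, n_values);
    cudaMalloc(&d_temp, temp_bytes);
    // Second call performs the histogram
    cudaError_t err =
        cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes, d_start_bins,
                                            d_hist, output_size + 1,
                                            0.0f, (float)output_size, n_values);
    cudaFree(d_temp);
    return err;
}
// Step 3 (cheap post-pass): result[y].x = sum of d_hist[y-region+1 .. y]
// and result[y].y = result[y].x / 2.

The complete code follows: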
#include <cuda.h>
#include <chrono>
#include <stdio.h>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#include <cstdlib>
#include <cmath>
#include <cstdint>
#define GPU_ERROR_CHECK(ans) do{gpuAssert((ans),__FILE__,__LINE__);}while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        // Report the failure once, to stderr, with file and line context
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// 3-component dot product
inline __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Euclidean length of a float3
inline __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}
// Per-element contribution added to each output bin: (val, val / 2)
inline __device__ float2 myKernel(float val) {
    return make_float2(val, val / 2);
}
// Modified kernel: stages each thread's pos_a/pos_b element through
// dynamic shared memory before the distance calculation
__global__ void mod_kernel(const float3 * pos_a, const float3 * pos_b,
                           const uint32_t input_size, const uint32_t output_size,
                           float2 * result, const int region,
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    // Dynamic shared memory: one pos_a slot and one pos_b slot per thread
    extern __shared__ float3 shared_mem[];
    float3* shared_pos_a = shared_mem;
    float3* shared_pos_b = shared_mem + blockDim.x;
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;
    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0.0f;
    uint32_t output_start, output_end;
    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        if (x >= input_size) break;
        // Load this iteration's elements into shared memory. Each thread
        // only reads the slot it wrote itself, so no __syncthreads() is
        // needed (and none would be safe inside this loop, since threads
        // can leave it at different iterations).
        shared_pos_a[threadIdx.x] = pos_a[x];
        shared_pos_b[threadIdx.x] = pos_b[x];
        // distance calcs
        distance3.x = shared_pos_a[threadIdx.x].x - shared_pos_b[threadIdx.x].x;
        distance3.y = shared_pos_a[threadIdx.x].y - shared_pos_b[threadIdx.x].y;
        distance3.z = shared_pos_a[threadIdx.x].z - shared_pos_b[threadIdx.x].z;
        distance = length(distance3);
        // Map the distance to a window of `region` consecutive output bins
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1.0f;
            if((y < output_end) && (y >= output_start)){
                float2 lval = myKernel(val);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }
        }
    }
}
// Baseline kernel: identical computation, but reads pos_a/pos_b straight
// from global memory
__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b,
                           const uint32_t input_size, const uint32_t output_size,
                           float2 * result, const int region,
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;
    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0.0f;
    uint32_t output_start, output_end;
    for(uint32_t x = thridx_x; x < last_idx_x; x += stride_x){
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;
        distance = length(distance3);
        // Map the distance to a window of `region` consecutive output bins
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;
        for(uint32_t y = thridx_y; y < output_size; y += stride_y){
            float val = 1.0f;
            if((y < output_end) && (y >= output_start)){
                float2 lval = myKernel(val);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }
        }
    }
}
// Copy both device arrays back and compare them element-by-element on the host
bool eval_arrays_equal(float2 * d_atomic, float2 * d_mod, uint32_t n) {
    if (d_atomic == nullptr || d_mod == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }
    size_t sz = n * sizeof(float2);
    float2 * h_atomic = (float2*)malloc(sz);
    float2 * h_mod = (float2*)malloc(sz);
    GPU_ERROR_CHECK(cudaMemcpy(h_atomic, d_atomic, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_mod, d_mod, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    bool equal = true;
    for (uint32_t i = 0; i < n; ++i) {
        if (h_atomic[i].x != h_mod[i].x || h_atomic[i].y != h_mod[i].y) {
            printf("\ti (%u): atomic (%f,%f), modified (%f,%f)\n", i, h_atomic[i].x, h_atomic[i].y, h_mod[i].x, h_mod[i].y);
            equal = false;
            break;
        }
    }
    // Free host buffers on every path, not just on success
    free(h_atomic);
    free(h_mod);
    return equal;
}
int main(int argc, char * argv[]) {
    if (argc != 2) {
        fprintf(stderr, "\nPass the number of array elements via command line as follows:\n");
        fprintf(stderr, "./xTest <num_elems>\n\n");
        return EXIT_FAILURE;
    }
    // Dimensions: nthreads is the block size, nblocks the grid size
    dim3 nthreads(512,1,1);
    dim3 nblocks(1,512,1);
    uint32_t n_values = static_cast<uint32_t>(std::stoi(argv[1]));
    uint32_t region = 3;
    uint32_t n_float3s = n_values;
    uint32_t float3_sz = n_float3s * sizeof(float3);
    uint32_t output_sz = n_values * sizeof(float2);
    // Allocate host & device side
    float2 *d_out_org;
    float2 *d_out_mod;
    GPU_ERROR_CHECK(cudaMalloc(&d_out_org, output_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_out_mod, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_org, 0, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_mod, 0, output_sz));
    // Float3s
    float3 *pos_a, *pos_b;
    float3 *d_pos_a, *d_pos_b;
    pos_a = (float3*)malloc(float3_sz);
    pos_b = (float3*)malloc(float3_sz);
    for(size_t p = 0; p < n_float3s; ++p){
        pos_a[p] = make_float3(1.0f, 1.0f, 1.0f);
        pos_b[p] = make_float3(0.1f, 0.1f, 0.1f);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_a, float3_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_b, float3_sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_a, pos_a, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_b, pos_b, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    float total_time_org = 0.0f;
    float total_time_mod = 0.0f;
    uint32_t first_idx_x = 0;
    uint32_t last_idx_x = n_values;
    // Time the baseline kernel; note the launch order is <<<grid, block>>>
    auto start = std::chrono::high_resolution_clock::now();
    org_kernel<<<nblocks,nthreads,0,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_org, region, first_idx_x, last_idx_x);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    auto stop = std::chrono::high_resolution_clock::now();
    total_time_org = static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count()) / 1000.0f;
    // Dynamic shared memory: two float3 arrays of blockDim.x elements each
    size_t sharedMemSize = 2 * nthreads.x * sizeof(float3);
    start = std::chrono::high_resolution_clock::now();
    mod_kernel<<<nblocks,nthreads,sharedMemSize,0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_mod, region, first_idx_x, last_idx_x);
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    stop = std::chrono::high_resolution_clock::now();
    total_time_mod = static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count()) / 1000.0f;
    // Check for fidelity
    if (eval_arrays_equal(d_out_org, d_out_mod, n_values)) {
        printf("\nFidelity achieved.\n");
        printf("\t[ORIGINAL] Time: %8.9f (us.)\n", total_time_org);
        printf("\t[MODIFIED] Time: %8.9f (us.)\n", total_time_mod);
    } else {
        printf("\nOriginal and modified kernels do not match.\n");
        return EXIT_FAILURE;
    }
    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    GPU_ERROR_CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
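For what it's worth, the other direction I have been sketching (also untested) is privatizing the result bins in shared memory so each block does its atomics locally and flushes to global memory once. This only works as-is when output_size float2 bins fit in the per-block shared memory budget (48 KB covers roughly 6000 bins), so it would need tiling for large output_size. mod_kernel_privatized is a placeholder name, and it folds the y dimension into a direct loop over each element's window:

__global__ void mod_kernel_privatized(const float3 *pos_a, const float3 *pos_b,
                                      const uint32_t input_size,
                                      const uint32_t output_size,
                                      float2 *result, const int region)
{
    // Launch with output_size * sizeof(float2) bytes of dynamic shared memory
    extern __shared__ float2 bins[];
    for (uint32_t i = threadIdx.x; i < output_size; i += blockDim.x)
        bins[i] = make_float2(0.0f, 0.0f);
    __syncthreads();
    for (uint32_t x = threadIdx.x + blockDim.x * blockIdx.x;
         x < input_size; x += blockDim.x * gridDim.x) {
        float3 d3 = make_float3(pos_a[x].x - pos_b[x].x,
                                pos_a[x].y - pos_b[x].y,
                                pos_a[x].z - pos_b[x].z);
        float distance = length(d3);
        uint32_t output_start = __fdiv_rz(output_size, 2)
                              + __fdiv_rz(distance, output_size) - region;
        // Shared-memory atomics instead of global ones in the hot loop
        for (uint32_t y = output_start; y < output_start + region; ++y) {
            if (y < output_size) {
                float2 lval = myKernel(1.0f);
                atomicAdd(&bins[y].x, lval.x);
                atomicAdd(&bins[y].y, lval.y);
            }
        }
    }
    __syncthreads();
    // One pair of global atomics per bin per block instead of per element
    for (uint32_t i = threadIdx.x; i < output_size; i += blockDim.x) {
        atomicAdd(&result[i].x, bins[i].x);
        atomicAdd(&result[i].y, bins[i].y);
    }
}

Does that look like the right direction, or is this window-of-region pattern better served by something in CUB?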