Hi all,
This is my first go at using warp primitives to speed up CUDA kernels. I’ve run into a surprising result: adding warp shuffles to a kernel I’ve written makes it slower than the version without shuffles, according to profiler reports from Nsight Compute. I’ve included the reports as PNGs at the bottom of this post, along with the full program I used for profiling. The program runs cleanly under compute-sanitizer with memcheck, racecheck, and synccheck (no errors reported).
The kernels are functionally very simple: each thread loads four contiguous elements from an array, computes their sum, and stores the sum. To my understanding, this loading pattern is a “blocked arrangement”, as described in the CUB library documentation.
Without any shuffles, the kernel looks like:
__global__
void store_kernel
(
int* d_array_load,
int* d_array_store
)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int load_idx = 4 * idx;
int buf[4];
buf[0] = d_array_load[load_idx + 0];
buf[1] = d_array_load[load_idx + 1];
buf[2] = d_array_load[load_idx + 2];
buf[3] = d_array_load[load_idx + 3];
d_array_store[idx] = buf[0] + buf[1] + buf[2] + buf[3];
}
where d_array_load has four times as many elements as d_array_store. My hypothesis was that this access pattern should lead to uncoalesced memory access, since each thread reads four consecutive ints and consecutive threads therefore access addresses 16 bytes apart. This was supported by profiling the kernel in Nsight Compute.
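(As an aside, I’m aware that a vectorized load might be another way to get coalesced access. Below is a rough, unprofiled sketch of what I mean, relying on d_array_load being 16-byte aligned, which cudaMalloc guarantees. I’d still like to understand the shuffle approach, though, so that’s what the rest of this post focuses on.)

// Hypothetical, unprofiled variant: each thread issues one 16-byte
// vectorized load instead of four 4-byte loads.
__global__
void store_kernel_vec
(
    const int4* d_array_load, // the same buffer, reinterpreted as int4
    int* d_array_store
)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int4 v = d_array_load[idx]; // one 16-byte load per thread
    d_array_store[idx] = v.x + v.y + v.z + v.w;
}

It would be launched with reinterpret_cast<const int4*>(d_array_load) as the first argument.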
To eliminate uncoalesced access, I wrote the following kernel that uses warp shuffles:
__global__
void store_kernel_warp
(
int* d_array_load,
int* d_array_store
)
{
const int tidx = threadIdx.x;
const int idx = blockIdx.x * blockDim.x + tidx;
const int warp_id = idx / 32;
const int lane_id = tidx % 32;
const int starting_load_idx = warp_id * 128;
int buf[4];
for (int i = 0; i < 4; i++)
{
int load_idx = starting_load_idx + i * 32 + lane_id;
int starting_lane_id = i * 8;
int val = d_array_load[load_idx];
buf[0] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 0);
buf[1] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 1);
buf[2] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 2);
buf[3] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 3);
if (lane_id >= starting_lane_id && lane_id < starting_lane_id + 8)
{
d_array_store[idx] = buf[0] + buf[1] + buf[2] + buf[3];
}
}
}
The kernel’s strategy for eliminating uncoalesced memory access is as follows. Each of the 32 threads in a warp needs to read 4 elements from d_array_load, so the warp needs 4 elements per thread x 32 threads = 128 elements in total. The warp therefore performs 4 reads, each fetching 32 contiguous elements (and therefore coalesced). Each read supplies a “batch” of 8 threads with their desired 4 elements, since 32 elements / 4 elements per thread = 8 threads: the first read of 32 elements gives threads 0 to 7 their 4 elements, the second read gives threads 8 to 15 theirs, and so on. The shuffles then redistribute the 32 loaded values to the right lanes within the warp.
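To convince myself the index arithmetic is correct, I also checked the mapping on the host with a small standalone simulation of the shuffles (a quick sanity check, not part of the profiled program):

#include <cassert>

// Simulates one warp: in iteration i, lane `lane` loads element i * 32 + lane
// (relative to the warp's 128-element chunk), and buf[j] comes from source
// lane 4 * (lane % 8) + j. Each lane should end up summing elements
// 4 * lane .. 4 * lane + 3.
int main()
{
    for (int lane = 0; lane < 32; lane++)
    {
        const int i = lane / 8; // the iteration in which this lane stores
        int sum = 0;
        for (int j = 0; j < 4; j++)
        {
            const int src_lane = 4 * (lane % 8) + j;
            sum += i * 32 + src_lane; // the element value held by src_lane
        }
        // Expected: elements 4*lane .. 4*lane + 3.
        assert(sum == 4 * lane + (4 * lane + 1) + (4 * lane + 2) + (4 * lane + 3));
    }
    return 0;
}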
When I profile store_kernel_warp in Nsight Compute, I no longer get any warnings about uncoalesced global memory access. Based on this and other information in the profiler report, I’m assuming the strategy described above has successfully eliminated the uncoalesced access.
However, despite eliminating uncoalesced memory access, store_kernel_warp is actually slower than store_kernel: 17 us compared to 13 us. Is this because store_kernel_warp performs more “total warp instructions” than store_kernel? (Terminology borrowed from this StackOverflow answer by harrism.) I’m posing this hypothesis because Nsight Compute reports 23359 cycles for store_kernel_warp compared to 19548 cycles for store_kernel. Should I modify the shuffle operations to reduce the instruction count and thereby achieve a speedup over store_kernel?
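For example, one modification I’ve been considering (a sketch only, which I haven’t profiled) is to sum within groups of 4 lanes using __shfl_down_sync with its width parameter set to 4. This needs 2 shuffles per iteration instead of 4, at the cost of a partially masked store:

// Hypothetical, unprofiled variant using a segmented reduction.
__global__
void store_kernel_warp_v2
(
    int* d_array_load,
    int* d_array_store
)
{
    const int tidx = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + tidx;
    const int warp_id = idx / 32;
    const int lane_id = tidx % 32;
    const int starting_load_idx = warp_id * 128;
    for (int i = 0; i < 4; i++)
    {
        // Coalesced read of 32 contiguous elements, as before.
        int val = d_array_load[starting_load_idx + i * 32 + lane_id];
        // Segmented sum over groups of 4 lanes (width = 4): afterwards,
        // lanes 0, 4, 8, ... hold the sum of their 4-lane segment.
        val += __shfl_down_sync(0xffffffff, val, 2, 4);
        val += __shfl_down_sync(0xffffffff, val, 1, 4);
        // Each 4-lane segment corresponds to one output element.
        if (lane_id % 4 == 0)
        {
            d_array_store[warp_id * 32 + i * 8 + lane_id / 4] = val;
        }
    }
}

Would a variant along these lines be the right direction, or is the extra shuffling simply not worth it for this access pattern?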
I am really sorry if there is something very elementary that I have missed. I would really appreciate any help, explanations, or links to clear learning materials.
Profiler reports from Nsight Compute
I’m sorry that some of the information in the reports is cut off; there seems to be a display issue with my monitors. The green bars show the baseline metrics obtained from profiling store_kernel.
Profiler report for store_kernel
:
Profiler report for store_kernel_warp
:
Full program used for profiling in Nsight Compute:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <cassert>
#define CHECK_CUDA_ERROR(ans) { assert_cuda( (ans), __LINE__); }
void assert_cuda
(
cudaError_t error,
int line
);
__host__
cudaError_t copy_cuda
(
void* dst,
void* src,
size_t bytes
);
__host__
void* malloc_device
(
size_t bytes
);
__host__
cudaError_t free_device
(
void* ptr
);
__global__
void store_kernel
(
int* d_array_load,
int* d_array_store
);
__global__
void store_kernel_warp
(
int* d_array_load,
int* d_array_store
);
int main
(
int argc,
char** argv
)
{
const int num_elems_load = 1 << 20;
const int num_elems_store = 1 << 18;
const int block_size = 256;
const int num_threads = num_elems_store;
const int num_blocks = num_threads / block_size;
size_t bytes_load = num_elems_load * sizeof(int);
size_t bytes_store = num_elems_store * sizeof(int);
int* h_array_load = new int[num_elems_load];
int* h_array_store = new int[num_elems_store];
int* d_array_load = (int*)malloc_device(bytes_load);
int* d_array_store = (int*)malloc_device(bytes_store);
for (int i = 0; i < num_elems_load; i++)
{
h_array_load[i] = i % 4;
}
CHECK_CUDA_ERROR(copy_cuda(d_array_load, h_array_load, bytes_load));
//store_kernel<<<num_blocks, block_size>>>(d_array_load, d_array_store);
store_kernel_warp<<<num_blocks, block_size>>>(d_array_load, d_array_store);
CHECK_CUDA_ERROR(cudaGetLastError()); // catch kernel launch errors
CHECK_CUDA_ERROR(copy_cuda(h_array_store, d_array_store, bytes_store));
for (int i = 0; i < num_elems_store; i++)
{
if (h_array_store[i] != 6)
{
fprintf(stderr, "Sum is not 6.\n");
exit(-1);
}
}
delete[] h_array_load;
delete[] h_array_store;
free_device(d_array_load);
free_device(d_array_store);
return 0;
}
void assert_cuda(cudaError_t error, int line)
{
if (error != cudaSuccess)
{
fprintf(stderr, "CUDA error: %s, %d\n", cudaGetErrorString(error), line);
exit(error);
}
}
cudaError_t copy_cuda
(
void* dst,
void* src,
size_t bytes
)
{
cudaError_t error = cudaMemcpy
(
dst,
src,
bytes,
cudaMemcpyDefault
);
return error;
}
__host__
void* malloc_device
(
size_t bytes
)
{
void* ptr = nullptr;
CHECK_CUDA_ERROR(cudaMalloc(&ptr, bytes)); // abort on allocation failure
return ptr;
}
__host__
cudaError_t free_device
(
void* ptr
)
{
return (nullptr != ptr) ? cudaFree(ptr) : cudaSuccess;
}
__global__
void store_kernel
(
int* d_array_load,
int* d_array_store
)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int load_idx = 4 * idx;
int buf[4];
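// Each thread loads 4 contiguous ints, so consecutive threads access
// addresses 16 bytes apart: a stride-4 pattern across the warp.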
buf[0] = d_array_load[load_idx + 0];
buf[1] = d_array_load[load_idx + 1];
buf[2] = d_array_load[load_idx + 2];
buf[3] = d_array_load[load_idx + 3];
d_array_store[idx] = buf[0] + buf[1] + buf[2] + buf[3];
}
__global__
void store_kernel_warp
(
int* d_array_load,
int* d_array_store
)
{
const int tidx = threadIdx.x;
const int idx = blockIdx.x * blockDim.x + tidx;
const int warp_id = idx / 32;
const int lane_id = tidx % 32;
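// Each warp consumes 128 contiguous elements: 4 per thread x 32 threads.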
const int starting_load_idx = warp_id * 128;
int buf[4];
for (int i = 0; i < 4; i++)
{
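// Coalesced read: the warp fetches 32 contiguous elements this iteration.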
int load_idx = starting_load_idx + i * 32 + lane_id;
int starting_lane_id = i * 8;
int val = d_array_load[load_idx];
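// Redistribute via shuffles: lane L gathers its 4 elements from
// source lanes 4 * (L % 8) .. 4 * (L % 8) + 3.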
buf[0] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 0);
buf[1] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 1);
buf[2] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 2);
buf[3] = __shfl_sync(0xffffffff, val, 4 * (lane_id % 8) + 3);
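// Only the batch of 8 lanes whose elements were fetched this iteration
// stores its sum.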
if (lane_id >= starting_lane_id && lane_id < starting_lane_id + 8)
{
d_array_store[idx] = buf[0] + buf[1] + buf[2] + buf[3];
}
}
}