According to my tests, using local on-chip shared memory does not seem to bring any performance benefit in Vulkan compute shaders on Nvidia GPUs.
I have written a test shader that demonstrates this behavior: it runs ~30x slower (15 ms vs. 0.5 ms) on Nvidia under Vulkan than under CUDA, or than under Vulkan on other manufacturers' GPUs. I tested with a GeForce RTX 2080 Ti, driver 496.49, Vulkan SDK 1.2.189.2, and CUDA 11.1. The test shaders are below.
Vulkan:
#version 450
#extension GL_EXT_control_flow_attributes : enable
#define LOCAL_SIZE_X 128
#define LOCAL_SIZE_Y 1
#define LOCAL_SIZE_Z 1
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
layout(std430, set = 0, binding = 0) readonly buffer InTensor
{
    float data[];
} inTensor;
layout(std430, set = 0, binding = 1) readonly buffer WeightsTensor
{
    float data[];
} weights;
layout(std430, set = 0, binding = 2) readonly buffer BiasesTensor
{
    float data[];
} biases;
layout(std430, set = 0, binding = 3) writeonly buffer OutTensor
{
    float data[];
} outTensor;
layout(set = 0, binding = 4) uniform Uniforms
{
    int inChannelCount;
    int outChannelCount;
    int inH;
    int inW;
    int outH;
    int outW;
    ivec2 padding;
} uniforms;
shared float cache[LOCAL_SIZE_X];
shared float outCache[LOCAL_SIZE_X];
shared float weightCache[LOCAL_SIZE_X];
void main()
{
    ivec3 gIds = ivec3(gl_GlobalInvocationID.xyz);
    ivec3 lIds = ivec3(gl_LocalInvocationID.xyz);
    int inChannels = uniforms.inChannelCount;
    int outChannels = uniforms.outChannelCount;
    int inSliceSize = uniforms.inH * uniforms.inW;
    int nIdx = int(gl_WorkGroupID.x);
    int x = gIds.y;
    int y = gIds.z;
    // Each invocation loads one input channel value for this pixel and one weight into shared memory.
    int inIdx = nIdx * inChannels * inSliceSize + lIds.x * inSliceSize + y * uniforms.inW + x;
    cache[lIds.x] = inTensor.data[inIdx];
    weightCache[lIds.x] = weights.data[lIds.x];
    barrier();
    // Each half of the workgroup (64 invocations) reduces over its 64 input channels from shared memory.
    int halfIdx = lIds.x / 64;
    float outVal = 0;
    int startC = halfIdx * 64;
    [[unroll]] for (int inCidx = startC; inCidx < (startC + 64); ++inCidx)
    {
        int weightIdx = (lIds.x - startC) * inChannels + inCidx; // computed but not used below
        outVal += cache[inCidx] * weightCache[inCidx];
    }
    outCache[lIds.x] = outVal;
    barrier();
    // The first half combines both partial sums and adds the bias for its output channel.
    if (halfIdx > 0) return;
    int outSliceSize = uniforms.outH * uniforms.outW;
    int outIdx = nIdx * outChannels * outSliceSize + lIds.x * outSliceSize + y * uniforms.outW + x;
    outTensor.data[outIdx] = outCache[lIds.x] + outCache[lIds.x + 64] + biases.data[lIds.x];
}
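For reference, the workgroup counts implied by the shader (and by the constants in the CUDA version below) are (N, W, H) = (2, 180, 108) with a 128x1x1 local size. The snippet below is only a minimal sketch of how such a dispatch can be timed with Vulkan timestamp queries, not my exact harness; it assumes a recording command buffer cmd, a two-query VK_QUERY_TYPE_TIMESTAMP pool queryPool, an already-bound compute pipeline and descriptor set, and timestampPeriod taken from VkPhysicalDeviceLimits::timestampPeriod.

// Fragment of a larger Vulkan host program (setup of cmd, queryPool, device,
// pipeline, descriptors, and timestampPeriod is assumed to exist elsewhere).
vkCmdResetQueryPool(cmd, queryPool, 0, 2);
vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, queryPool, 0);
vkCmdDispatch(cmd, 2 /* N */, 180 /* W */, 108 /* H */);
vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, queryPool, 1);
// ... end recording, submit, and wait for the queue ...
uint64_t ts[2] = {};
vkGetQueryPoolResults(device, queryPool, 0, 2, sizeof(ts), ts, sizeof(uint64_t),
                      VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
double elapsedMs = double(ts[1] - ts[0]) * timestampPeriod * 1e-6;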
CUDA:
#define LOCAL_SIZE_X 128
#define LOCAL_SIZE_Y 1
#define LOCAL_SIZE_Z 1
static constexpr int N = 2;
static constexpr int C_in = 128;
static constexpr int C_out = 64;
static constexpr int H = 108;
static constexpr int W = 180;
__global__ void convolution(const float* inTensor, const float* weights, const float* biases, float* outTensor)
{
    __shared__ float cache[LOCAL_SIZE_X];
    __shared__ float outCache[LOCAL_SIZE_X];
    __shared__ float weightCache[LOCAL_SIZE_X];
    // Global thread indices
    size_t gy = blockIdx.y * blockDim.y + threadIdx.y;
    size_t gz = blockIdx.z * blockDim.z + threadIdx.z;
    // Local thread indices
    size_t lx = threadIdx.x;
    int inSliceSize = H * W;
    int nIdx = int(blockIdx.x);
    int x = gy;
    int y = gz;
    // Each thread loads one input channel value for this pixel and one weight into shared memory.
    int inIdx = nIdx * C_in * inSliceSize + lx * inSliceSize + y * W + x;
    cache[lx] = inTensor[inIdx];
    weightCache[lx] = weights[lx];
    __syncthreads();
    // Each half of the block (64 threads) reduces over its 64 input channels from shared memory.
    int halfIdx = lx / 64;
    float outVal = 0;
    int startC = halfIdx * 64;
    for (int inCidx = startC; inCidx < (startC + 64); ++inCidx)
    {
        int weightIdx = (lx - startC) * C_in + inCidx; // computed but not used below
        outVal += cache[inCidx] * weightCache[inCidx];
    }
    outCache[lx] = outVal;
    __syncthreads();
    // The first half combines both partial sums and adds the bias for its output channel.
    if (halfIdx > 0) return;
    int outSliceSize = H * W;
    int outIdx = nIdx * C_out * outSliceSize + lx * outSliceSize + y * W + x;
    outTensor[outIdx] = outCache[lx] + outCache[lx + 64] + biases[lx];
}
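For completeness, here is a minimal sketch of a host-side launch and timing loop consistent with the kernel's indexing (one block per (n, x, y) position, 128 threads per block), assuming it is appended to the same .cu file as the kernel and constants above. The buffer names (d_in, d_weights, d_biases, d_out) are placeholders and the data is left uninitialized, since only the kernel time is measured.

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    // Buffer sizes implied by the constants above.
    float *d_in = nullptr, *d_weights = nullptr, *d_biases = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in, size_t(N) * C_in * H * W * sizeof(float));
    cudaMalloc(&d_weights, size_t(C_out) * C_in * sizeof(float));
    cudaMalloc(&d_biases, size_t(C_out) * sizeof(float));
    cudaMalloc(&d_out, size_t(N) * C_out * H * W * sizeof(float));

    // One block per (batch, x, y) position, matching blockIdx.x = nIdx,
    // blockIdx.y = x, blockIdx.z = y in the kernel; 128 threads per block.
    dim3 grid(N, W, H);
    dim3 block(LOCAL_SIZE_X, LOCAL_SIZE_Y, LOCAL_SIZE_Z);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    convolution<<<grid, block>>>(d_in, d_weights, d_biases, d_out);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel time: %.3f ms\n", ms);

    cudaFree(d_in); cudaFree(d_weights); cudaFree(d_biases); cudaFree(d_out);
    return 0;
}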