Hi, I ran into a performance issue using the CUDA driver API to set a shared memory carveout before submitting a kernel. I have a multi-threaded application in which I'd like to submit overlapping memory transfers and kernel launches from several threads. Ideally, kernels from one thread should launch and execute while a memory transfer launched from another thread is in progress. That does work correctly, until I need to specify the shared memory carveout for a kernel using cuFuncSetAttribute with either CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES or CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. I need to do this because cuLaunchKernel fails for dynamic shared memory allocations above 48 KB unless the attribute is set.
I found that cuFuncSetAttribute takes a lock that seems to wait until the async memcpy finishes. This happens regardless of the value passed in (even 1 byte). Naively, I assumed these operations have no dependency on each other, so I'm puzzled why this is happening.
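For reference, the pattern that needs the attribute looks roughly like this (a sketch only; CHECK, kernel_function, gridSize, blockSize, args and stream are as in the reproducer below, and 64 * 1024 stands in for my actual dynamic shared memory size):
// Opt in to more than 48 KB of dynamic shared memory for this function.
// 64 * 1024 is a placeholder; the achievable maximum is device-dependent.
CHECK(cuFuncSetAttribute(kernel_function,
                         CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                         64 * 1024));
// Without the attribute set, a launch requesting this much dynamic
// shared memory fails.
CHECK(cuLaunchKernel(kernel_function, gridSize, 1, 1, blockSize, 1, 1,
                     64 * 1024 /* sharedMemBytes */, stream, args, 0));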
Here’s a minimal reproducer:
#include <cuda.h>
#include <iostream>
#include <thread>
#include <chrono>
#include <pthread.h>
#include <vector>
#include <cstring>
#define CHECK(x) do { \
    auto err{x}; \
    if (err != CUDA_SUCCESS) { \
        std::cerr << "CUDA Error " \
                  << err << " in " #x << "\n"; \
    } \
} while (0)
unsigned int blockSize = 480;
unsigned int num_elements = 1201920;
CUfunction kernel_function;
float value_to_add{1.0f};
bool set_attr{false};
bool verifyOutput(float *outputData, int num) {
    for (int i = 0; i < num; ++i) {
        if (outputData[i] != 1.0f) return false;
    }
    return true;
}
void doWork(int iterationsPerThread,
            int num_kernel_calls, float *inputData, float *outputData,
            int threadId, CUstream stream, CUdeviceptr d_inputData,
            CUdeviceptr d_outputData) {
    // Each thread works on its own slice of the host buffers and its own
    // device buffers, so the streams have no data dependencies on each other.
    inputData = inputData + threadId * num_elements;
    outputData = outputData + threadId * num_elements;
    unsigned int gridSize = (num_elements + blockSize - 1) / blockSize;
    for (int i = 0; i < iterationsPerThread; i++) {
        CHECK(cuMemcpyHtoDAsync(d_inputData, inputData, num_elements * sizeof(float), stream));
        for (int k = 0; k < num_kernel_calls; k++) {
            void *args[4] = {&d_inputData, &d_outputData, &value_to_add, &num_elements};
            if (set_attr) {
                // The value is irrelevant -- even 1 byte triggers the stall.
                CHECK(cuFuncSetAttribute(kernel_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 1));
            }
            CHECK(cuLaunchKernel(kernel_function, gridSize, 1, 1, blockSize, 1, 1, 0, stream, args, 0));
        }
        CHECK(cuMemcpyDtoHAsync(outputData, d_outputData, num_elements * sizeof(float), stream));
    }
}
int main(int argc, char **argv) {
    // -a enables the per-launch cuFuncSetAttribute call.
    if (argc > 1 && std::string{argv[1]} == "-a") {
        set_attr = true;
    }
    constexpr int numThreads = 4;
    int totalIterations = 500;
    int num_kernel_calls = 100;
    int iterationsPerThread = totalIterations / numThreads;
    std::cout << "Threads: " << numThreads << " Total iterations: " << totalIterations
              << " Iterations per thread: " << iterationsPerThread << "\n";
    // Zero-filled pageable host buffers, one slice per thread; the kernel
    // adds 1.0f, so every output element should be exactly 1.0f.
    float *inputData = static_cast<float *>(malloc(num_elements * numThreads * sizeof(float)));
    float *outputData = static_cast<float *>(malloc(num_elements * numThreads * sizeof(float)));
    memset(inputData, 0, num_elements * numThreads * sizeof(float));
    memset(outputData, 0, num_elements * numThreads * sizeof(float));
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&device, 0));
    // A single context, shared by all worker threads via the streams below.
    CHECK(cuCtxCreate(&context, 0, device));
    CHECK(cuModuleLoad(&module, "kernel.ptx"));
    CHECK(cuModuleGetFunction(&kernel_function, module, "add_value"));
    // One stream and one pair of device buffers per worker thread.
    CUstream streams[numThreads];
    CUdeviceptr d_inputDataList[numThreads];
    CUdeviceptr d_outputDataList[numThreads];
    for (int threadId = 0; threadId < numThreads; ++threadId) {
        CHECK(cuStreamCreate(&streams[threadId], CU_STREAM_NON_BLOCKING));
        CHECK(cuMemAlloc(&d_inputDataList[threadId], num_elements * sizeof(float)));
        CHECK(cuMemAlloc(&d_outputDataList[threadId], num_elements * sizeof(float)));
    }
    std::vector<std::thread> threads;
    auto start_time{std::chrono::high_resolution_clock::now()};
    for (int threadId = 0; threadId < numThreads; ++threadId) {
        threads.push_back(std::thread(
            doWork, iterationsPerThread, num_kernel_calls, inputData, outputData,
            threadId, streams[threadId], d_inputDataList[threadId], d_outputDataList[threadId]));
    }
    for (auto &t : threads) {
        t.join();
    }
    CHECK(cuCtxSynchronize());
    auto end_time{std::chrono::high_resolution_clock::now()};
    std::cout << "Total time: "
              << std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time).count()
              << " s" << std::endl;
    if (verifyOutput(outputData, num_elements * numThreads)) {
        std::cout << "Verification succeeded" << std::endl;
    } else {
        std::cout << "Verification failed" << std::endl;
    }
    return 0;
}
with a minimal kernel (kernel.cu):
extern "C" __global__ void add_value(float *inputData, float *outputData, float add_value, unsigned int num_elements) {
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < num_elements)
outputData[tid] = inputData[tid] + add_value;
}
and I run it like this:
# compile
export CUDA_ARCH=sm_86
nvcc -arch=${CUDA_ARCH} -o kernel.o -c kernel.cu
cuobjdump -ptx kernel.o | sed -n '/^\.version/,$p' > kernel.ptx
nvcc -arch=${CUDA_ARCH} -o main main-cuda.cpp -lcuda
# run without cuFuncSetAttribute
./main
# run with cuFuncSetAttribute
./main -a
The hardcoded totalIterations and num_kernel_calls reproduce this well on my PC with an RTX 3060, but may need adjusting for bigger/faster hardware.
Here’s what I see in Nsight Systems - without cuFuncSetAttribute above (overlapping operations) and with cuFuncSetAttribute below (locking):
We can see that pthread_rwlock_wrlock appears in the system trace and no new API calls are made while the async memcpy is ongoing. It looks like the other threads are waiting on a lock taken inside cuFuncSetAttribute.
What is the reason for this synchronisation, and is it documented anywhere? Is there any way to avoid it and still launch kernels with large dynamic shared memory allocations?
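For what it's worth, the only mitigation I can think of is hoisting the call out of the hot path: since cuFuncSetAttribute sets a persistent property of the CUfunction, it should be enough to call it once per function before the worker threads start, rather than before every launch. A sketch against the reproducer above (64 * 1024 is again just a placeholder size):
// In main(), after cuModuleGetFunction and before the threads are created:
CHECK(cuFuncSetAttribute(kernel_function,
                         CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                         64 * 1024)); // largest dynamic allocation any launch will request
// doWork then launches with sharedMemBytes up to that value and never
// touches the attribute again.
That would keep the lock out of the launch loop, but it doesn't help if different threads need different carveouts for the same function, so I'd still like to understand the synchronisation itself.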