Hi, I want to limit shared memory usage by reusing an existing shared memory allocation for CUB.
Something like:
size_t smemCub = ...
size_t smemMyAlgorithm = ...
size_t smemForKernel = max(smemCub, smemMyAlgorithm)
kernel<<<grid, block, smemForKernel>>>();
How can I compute the required amount of shared memory for CUB from the host?
The following program prints sizeof(cub::BlockReduce<int, blocksize>::TempStorage) once from host code and once from device code, and the two values differ:
#include <cub/cub.cuh>
#include <cstdio>
// Block size used as the BLOCK_DIM_X template argument of the CUB reduction below.
constexpr int blocksize = 128;

// Block-wide reduction over int, specialized for `blocksize`.
using BlockReduceInt = cub::BlockReduce<int, blocksize>;

// Scratch-storage type that BlockReduceInt exposes as its nested TempStorage.
using BlockReduceIntStorage = BlockReduceInt::TempStorage;
/**
 * Prints, from device code, the size in bytes of
 * cub::BlockReduce<int, blocksize>::TempStorage.
 *
 * Takes no arguments and declares no shared memory; it only evaluates sizeof
 * on the storage type. Every thread that runs the kernel executes the printf,
 * so the number of output lines equals the number of launched threads.
 */
__global__
void kernel(){
    using BlockReduceInt = cub::BlockReduce<int, blocksize>;
    using BlockReduceIntStorage = typename BlockReduceInt::TempStorage;

    // size_t is not guaranteed to match "%lu" (unsigned long is 32-bit on
    // 64-bit Windows), so convert to a fixed 64-bit-wide type and print it
    // with "%llu", which device-side printf accepts.
    const unsigned long long size = sizeof(BlockReduceIntStorage);
    printf("device size %llu\n", size);
}
/**
 * Host entry point.
 *
 * Prints the size in bytes of cub::BlockReduce<int, blocksize>::TempStorage as
 * evaluated in host code, then launches `kernel` with a single block of a
 * single thread so the device-side value is printed for comparison.
 *
 * Returns 0 on success, or 1 if the kernel launch or the subsequent
 * synchronization reports a CUDA error (the error string goes to stderr).
 */
int main(){
    using BlockReduceInt = cub::BlockReduce<int, blocksize>;
    using BlockReduceIntStorage = typename BlockReduceInt::TempStorage;

    size_t size = sizeof(BlockReduceIntStorage);
    // "%zu" is the portable conversion for size_t in host printf.
    printf("host size %zu\n", size);

    kernel<<<1,1>>>();

    // A kernel launch returns nothing; configuration errors must be fetched explicitly.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Wait for the kernel to finish so its printf output is flushed, and
    // surface any error raised while it was running.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
Output:
host size 980
device size 24