How can one determine the temp storage size for CUB block primitives from the host?

Hi, I want to limit shared memory usage by reusing an existing shared memory allocation for CUB.
Something like:

// Shared memory needed by the CUB block primitive.
size_t smemCub = ...
// Shared memory needed by the rest of the algorithm.
size_t smemMyAlgorithm = ...
// Both uses share a single allocation, so the launch requests the larger of the two.
size_t smemForKernel = max(smemCub, smemMyAlgorithm)
kernel<<<grid, block, smemForKernel>>>();

How can I compute the required amount of shared memory for CUB from the host?
The following code outputs different sizes of shared memory on the host and on the device.

#include <cub/cub.cuh>
#include <cstdio>

// Block dimension passed as the second template argument of cub::BlockReduce.
constexpr int blocksize = 128;

// Block-wide reduction over int values; all remaining template parameters of
// cub::BlockReduce are left at their defaults.
using BlockReduceInt = cub::BlockReduce<int, blocksize>;
// Temporary storage type that BlockReduceInt requires.
using BlockReduceIntStorage = typename BlockReduceInt::TempStorage;

/**
 * Prints sizeof(cub::BlockReduce<int, blocksize>::TempStorage) as seen by
 * device code.
 *
 * Takes no arguments. Every thread that executes the kernel prints one line
 * of the form "device size <bytes>".
 */
__global__
void kernel(){
    using BlockReduceInt = cub::BlockReduce<int, blocksize>;
    using BlockReduceIntStorage = typename BlockReduceInt::TempStorage;

    // Device-side printf documents only the h / l / ll length modifiers, and
    // the width of `long` differs between 64-bit Windows and other 64-bit
    // platforms. Widening to unsigned long long and printing with %llu keeps
    // the argument and the conversion in agreement everywhere.
    const unsigned long long size = sizeof(BlockReduceIntStorage);

    printf("device size %llu\n", size);
}

/**
 * Prints sizeof(cub::BlockReduce<int, blocksize>::TempStorage) as seen by
 * host code, then launches `kernel` with a single thread so the same size is
 * printed from device code for comparison.
 *
 * Returns 0 on success and 1 if the kernel launch or its execution reports a
 * CUDA error.
 */
int main(){

    using BlockReduceInt = cub::BlockReduce<int, blocksize>;
    using BlockReduceIntStorage = typename BlockReduceInt::TempStorage;

    const size_t size = sizeof(BlockReduceIntStorage);

    // %zu is the portable conversion for size_t in host printf.
    printf("host size %zu\n", size);

    kernel<<<1,1>>>();

    // A kernel launch does not return a status itself; launch-configuration
    // errors are only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if(status != cudaSuccess){
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Waiting for the device both surfaces errors raised while the kernel ran
    // and makes sure the device-side printf output is flushed before exit.
    status = cudaDeviceSynchronize();
    if(status != cudaSuccess){
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
Output:

host size 980
device size 24

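The problem is that __CUDA_ARCH__, which supplies the value of the defaulted PTX_ARCH template argument, is not defined in host code. I found the following solution, which works when the PTX_ARCH value passed explicitly on the host matches the __CUDA_ARCH__ the kernel is compiled for, i.e. the SM version of the device.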

// Look up the compute capability (major.minor) of device 0.
int major = 0;
int minor = 0;
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0);
cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0);

// Encode it as major * 100 + minor * 10, e.g. 6.1 -> 610.
int arch = 100 * major + 10 * minor;
printf("arch = %d\n", arch);

// Take the TempStorage size from the BlockReduce specialization whose last
// template argument equals the detected architecture. Only 610 is handled
// explicitly; every other value uses the specialization for 200.
size_t size = 0;
if(arch == 610){
    using Reduce610 = cub::BlockReduce<int, blocksize, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 610>;
    size = sizeof(Reduce610::TempStorage);
}else{
    using Reduce200 = cub::BlockReduce<int, blocksize, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 200>;
    size = sizeof(Reduce200::TempStorage);
}