How to use cudaOccupancyMaxPotentialBlockSizeVariableSMem

here is simple code:

#include <stdio.h>
#include <cuda_runtime.h>
__global__ void myKernel(int *data)
{
extern __shared__ int shared[];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
shared[tid] = tid;
__syncthreads();
data[tid] = shared[tid];
}
int main()
{
    int numBlocks = 0;     // minimum grid size needed for maximum occupancy
    int maxBlockSize = 0;  // occupancy-optimal threads per block

    // The 4th argument must be a unary callable (lambda or functor) mapping a
    // candidate block size to the bytes of dynamic shared memory it requires.
    // Passing the integer literal 0 here is what produces the
    // "expression preceding parentheses of apparent call must have
    // (pointer-to-) function type" compile error.
    auto blockSizeToSmem = [](int blockSize) {
        return static_cast<size_t>(blockSize) * sizeof(int);  // one int per thread
    };
    cudaError_t err = cudaOccupancyMaxPotentialBlockSizeVariableSMem(
        &numBlocks, &maxBlockSize, myKernel, blockSizeToSmem);
    if (err != cudaSuccess) {
        fprintf(stderr, "Occupancy query failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Max potential block size: %d\n", maxBlockSize);
    printf("Number of blocks: %d\n", numBlocks);
    int totalThreads = numBlocks * maxBlockSize;
    printf("Total number of threads: %d\n", totalThreads);

    int *d_data = NULL;
    size_t dataSize = (size_t)totalThreads * sizeof(int);
    err = cudaMalloc((void **)&d_data, dataSize);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // The dynamic shared-memory size at launch must match what the callable
    // promised for the chosen block size — not a hard-coded thread count.
    size_t sharedMemSize = (size_t)maxBlockSize * sizeof(int);
    myKernel<<<numBlocks, maxBlockSize, sharedMemSize>>>(d_data);
    err = cudaGetLastError();        // catches bad launch configuration
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();         // surfaces in-kernel execution faults

    cudaFree(d_data);
    cudaDeviceReset();
    return 0;
}

and i got this error:

/usr/local/cuda-12.2/bin/../targets/x86_64-linux/include/cuda_runtime.h(1871): error: expression preceding parentheses of apparent call must have (pointer-to-) function type

dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

^

detected during:

instantiation of "cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int *, int *, T, UnaryFunction, int, unsigned int) [with UnaryFunction=int, T=void (*)(int *)]" at line 1951

instantiation of "cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(int *, int *, T, UnaryFunction, int) [with UnaryFunction=int, T=void (*)(int *)]" at line 28 of test.cu

The 4th parameter must be a function object

cudaOccupancyMaxPotentialBlockSizeVariableSMem ( int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0 )

blockSizeToDynamicSMemSize

  • A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block

could you give me an example, please

#include <stdio.h>
#include <cuda_runtime.h>
// Copies each thread's global index to data[] by way of dynamic shared memory.
// Launch requirements: dynamic shared memory of blockDim.x * sizeof(int) bytes,
// and a grid whose total thread count equals the length of data.
__global__ void myKernel(int *data)
{
    extern __shared__ int shared[];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    // Shared memory is per-block and only blockDim.x ints large, so it must be
    // indexed with threadIdx.x; indexing with the global tid goes out of
    // bounds for every block after block 0.
    shared[threadIdx.x] = tid;
    __syncthreads();
    data[tid] = shared[threadIdx.x];
}

// Unary functor consumed by cudaOccupancyMaxPotentialBlockSizeVariableSMem:
// given a candidate block size, it reports how many bytes of dynamic shared
// memory a block of that size would require.
struct ConvertBlockSizeToDynamicSMemSize{
    __host__ __device__
    size_t operator()(int blocksize){
        // One int per thread; adjust this to the kernel's real requirement.
        size_t perThreadBytes = sizeof(int);
        return perThreadBytes * static_cast<size_t>(blocksize);
    }
};

int main()
{
    int numBlocks = 0;     // minimum grid size needed for maximum occupancy
    int maxBlockSize = 0;  // occupancy-optimal threads per block

    // The occupancy calculator evaluates candidate block sizes and asks the
    // functor how much dynamic shared memory each candidate would need.
    cudaError_t err = cudaOccupancyMaxPotentialBlockSizeVariableSMem(
        &numBlocks, &maxBlockSize, myKernel, ConvertBlockSizeToDynamicSMemSize());
    if (err != cudaSuccess) {
        fprintf(stderr, "Occupancy query failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Max potential block size: %d\n", maxBlockSize);
    printf("Number of blocks: %d\n", numBlocks);
    int totalThreads = numBlocks * maxBlockSize;
    printf("Total number of threads: %d\n", totalThreads);

    int *d_data = NULL;
    size_t dataSize = (size_t)totalThreads * sizeof(int);
    err = cudaMalloc((void **)&d_data, dataSize);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // The launch's dynamic shared-memory size must match what the functor
    // promised for the CHOSEN block size. The original hard-coded
    // 256 * sizeof(int), which is too small (out-of-bounds shared accesses)
    // whenever the API recommends a block larger than 256 threads.
    size_t sharedMemSize =
        ConvertBlockSizeToDynamicSMemSize()(maxBlockSize);
    myKernel<<<numBlocks, maxBlockSize, sharedMemSize>>>(d_data);
    err = cudaGetLastError();        // catches bad launch configuration
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();         // surfaces in-kernel execution faults

    cudaFree(d_data);
    cudaDeviceReset();
    return 0;
}

First of all, thanks for your help. By the way, I am still a little confused.

here the code:

#include <stdio.h>

// Minimal demo kernel: each thread prints its own thread/block coordinates.
// Device-side printf is for debugging only — output is serialized and slow.
__global__ void kernel()
{
    const int threadId = threadIdx.x;
    const int blockId = blockIdx.x;
    printf("Hello World from thread %d of block %d\n", threadId, blockId);
}

int main()
{
    int N = 256;          // number of work items; also used as the block-size cap
    int blockSize = 0;    // occupancy-optimal threads per block (output)
    int minGridSize = 0;  // minimum grid size for maximum occupancy (output)
    int gridSize;

    // Calculate the maximum potential block size and minimum grid size.
    // 4th argument: dynamic shared-memory bytes per block (0 = none).
    // 5th argument is blockSizeLimit, NOT an element count — passing N here
    // caps the candidate block sizes at 256.
    cudaError_t err =
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, 0, N);
    if (err != cudaSuccess) {
        fprintf(stderr, "Occupancy query failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Ceil-divide so the grid covers all N work items.
    gridSize = (N + blockSize - 1) / blockSize;

    printf("Recommended Block Size: %d\n", blockSize);
    printf("Minimum Grid Size: %d\n", minGridSize);
    printf("Grid Size: %d\n", gridSize);

    return 0;
}

When using either cudaOccupancyMaxPotentialBlockSize or cudaOccupancyMaxPotentialBlockSizeVariableSMem I get the same maxBlockSize and numBlocks. I thought cudaOccupancyMaxPotentialBlockSizeVariableSMem would return an occupancy per tile (i.e., a block size constrained by the shared-memory usage)? Or is the occupancy with shared memory the same as with global memory?

It is not clear to me what you are trying to achieve. The occupancy functions return a number of threads per block or a number of threadblocks per grid, not a shared memory configuration

If that is the case, what is the difference between cudaOccupancyMaxPotentialBlockSizeVariableSMem and cudaOccupancyMaxPotentialBlockSize? Both of them return the same blockSize and numBlocks, which would suggest we only ever need cudaOccupancyMaxPotentialBlockSize.

I would suggest taking a look at the API references. CUDA Runtime API :: CUDA Toolkit Documentation

Use cudaOccupancyMaxPotentialBlockSizeVariableSMem when the amount of per-block dynamic shared memory changes with different block sizes; when it is constant, cudaOccupancyMaxPotentialBlockSize with a fixed dynamicSMemSize argument is sufficient (which is why both return the same result here).