#include <stdio.h>
#include <cuda_runtime.h>
// Writes each thread's global index into data[], staging the value through
// dynamic shared memory.
//
// Launch requirements:
//   - 1D grid of 1D blocks.
//   - Dynamic shared memory of at least blockDim.x * sizeof(int) bytes
//     (third launch parameter).
//   - data must hold at least gridDim.x * blockDim.x ints; the signature
//     carries no element count, so no bounds check is possible here.
__global__ void myKernel(int *data)
{
    extern __shared__ int shared[];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Shared memory is allocated per block, so it must be indexed with the
    // block-local thread index. Indexing it with the global index overruns
    // the allocation in every block except block 0.
    shared[threadIdx.x] = tid;
    __syncthreads();
    data[tid] = shared[threadIdx.x];
}
// Queries an occupancy-based launch configuration for myKernel, whose dynamic
// shared-memory need grows with the block size, then launches it once.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Maps a candidate block size to the dynamic shared memory (in bytes)
    // the kernel needs at that size: one int per thread. The occupancy API
    // requires a callable here; passing a plain integer does not compile.
    struct BlockSizeToSmem {
        __host__ __device__ size_t operator()(int blockSize) const
        {
            return static_cast<size_t>(blockSize) * sizeof(int);
        }
    };

    // Reports a failed CUDA runtime call; returns true when err is success.
    auto ok = [](cudaError_t err, const char *what) -> bool {
        if (err != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
            return false;
        }
        return true;
    };

    int numBlocks = 0;     // minimum grid size suggested by the occupancy API
    int maxBlockSize = 0;  // block size suggested by the occupancy API

    if (!ok(cudaOccupancyMaxPotentialBlockSizeVariableSMem(
                &numBlocks, &maxBlockSize, myKernel, BlockSizeToSmem()),
            "cudaOccupancyMaxPotentialBlockSizeVariableSMem")) {
        return 1;
    }

    printf("Max potential block size: %d\n", maxBlockSize);
    printf("Number of blocks: %d\n", numBlocks);
    int totalThreads = numBlocks * maxBlockSize;
    printf("Total number of threads: %d\n", totalThreads);

    // Size the shared memory for the block size that is actually launched,
    // using the same mapping that was given to the occupancy query.
    size_t sharedMemSize = BlockSizeToSmem()(maxBlockSize);
    size_t dataSize = static_cast<size_t>(totalThreads) * sizeof(int);

    int *d_data = NULL;
    if (!ok(cudaMalloc((void **)&d_data, dataSize), "cudaMalloc")) {
        return 1;
    }

    myKernel<<<numBlocks, maxBlockSize, sharedMemSize>>>(d_data);

    // Launch-configuration errors surface through cudaGetLastError();
    // faults raised while the kernel runs surface at the synchronize.
    bool success = ok(cudaGetLastError(), "myKernel launch") &&
                   ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Release the buffer on both the success and the failure path.
    success = ok(cudaFree(d_data), "cudaFree") && success;
    success = ok(cudaDeviceReset(), "cudaDeviceReset") && success;

    return success ? 0 : 1;
}
And I got this error:
/usr/local/cuda-12.2/bin/../targets/x86_64-linux/include/cuda_runtime.h(1871): error: expression preceding parentheses of apparent call must have (pointer-to-) function type
instantiation of "cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int *, int *, T, UnaryFunction, int, unsigned int) [with UnaryFunction=int, T=void (*)(int *)]" at line 1951
instantiation of "cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(int *, int *, T, UnaryFunction, int) [with UnaryFunction=int, T=void (*)(int *)]" at line 28 of test.cu
First of all, thanks for your help. By the way, I am still a little confused.
Here is the code:
#include <stdio.h>
// Debug kernel: every thread prints its own thread index and block index
// (x dimension only).
__global__ void kernel()
{
    const unsigned int threadInBlock = threadIdx.x;
    const unsigned int blockInGrid = blockIdx.x;

    printf("Hello World from thread %d of block %d\n", threadInBlock, blockInGrid);
}
// Asks the occupancy API for a launch configuration for `kernel` and prints
// it, together with the grid size needed to cover N threads. The kernel
// itself is not launched. Returns 0 on success and 1 if the query fails.
int main()
{
    int N = 256;
    int blockSize = 0;
    int minGridSize = 0;

    // Fourth argument: dynamic shared memory per block (none here).
    // Fifth argument: blockSizeLimit, so the suggested block size is capped
    // at N threads.
    cudaError_t err = cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, 0, N);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaOccupancyMaxPotentialBlockSize failed: %s\n",
                cudaGetErrorString(err));
        return 1;
    }
    if (blockSize <= 0) {
        // Guard the ceil-division below against a zero divisor.
        fprintf(stderr, "cudaOccupancyMaxPotentialBlockSize returned block size %d\n",
                blockSize);
        return 1;
    }

    // Ceil-division: number of blocks needed so that gridSize * blockSize >= N.
    int gridSize = (N + blockSize - 1) / blockSize;

    printf("Recommended Block Size: %d\n", blockSize);
    printf("Minimum Grid Size: %d\n", minGridSize);
    printf("Grid Size: %d\n", gridSize);
    return 0;
}
Whether I use cudaOccupancyMaxPotentialBlockSize or cudaOccupancyMaxPotentialBlockSizeVariableSMem, I still get the same maxBlockSize and numBlocks. I thought cudaOccupancyMaxPotentialBlockSizeVariableSMem would return the occupancy for the tile (the block size of the shared memory)? Or maybe the occupancy with shared memory is the same as with global memory?
It is not clear to me what you are trying to achieve. The occupancy functions return a number of threads per block or a number of thread blocks per grid, not a shared memory configuration.
If so, what is the difference between cudaOccupancyMaxPotentialBlockSizeVariableSMem and cudaOccupancyMaxPotentialBlockSize? Both of them return the same BlockSize and numBlocks, which would mean we only need to use cudaOccupancyMaxPotentialBlockSize.