Max shared memory

Hello everyone,

I am having trouble increasing the size of my shared memory.

I need to have the biggest array possible in the shared memory. My GPU is the RTX 2080 Super with compute capability 7.5. Programming guide says that the max shared memory per multiprocessor is 64kB and the max per block is the same. It also says that a single thread can access the whole shared memory of a block. I declared the shared memory dynamically as I want to extend it to more than 48kB as explained in the programming guide.

However, when I try to launch a kernel with more than 49152 bytes of shared memory (48kB), I get an “invalid argument” error.
I have tried to use cudaFuncSetAttribute, cudaDeviceSetCacheConfig and cudaFuncSetCacheConfig, but none of them had any effect.
When I try to get the Device Properties sharedMemPerBlock and sharedMemPerMultiprocessor, it returns 49152 for sharedMemPerBlock and 65536 for sharedMemPerMultiprocessor.

Here is my code:

#include <cstdint>   // uint32_t (was only available transitively)
#include <cstdio>    // printf   (was only available transitively)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <cuda_runtime_api.h>

// Launch configuration: 48 blocks of one thread each. With a single thread
// per block the shared-memory init loop in the kernel runs serially.
#define BLOCKS_NB		48
#define THREADS_PER_BLOCK	1

// Fills a dynamically-sized shared-memory array with 1s, then thread 0 of
// each block sums it and prints the result (expected: 12288 per block).
//
// Launch requirements:
//   - The 3rd launch parameter must supply at least
//     array_size * sizeof(uint32_t) = 12288 * 4 = 49152 bytes of dynamic
//     shared memory.
//   - Requesting more than 48 KB requires the host-side opt-in via
//     cudaFuncSetAttribute(cudaFuncAttributeMaxDynamicSharedMemorySize).
__global__ void TestFunction() {

	extern __shared__ uint32_t myTab[];

	uint32_t t = (threadIdx.x);
	uint32_t array_size = 12288;
	uint32_t mySum  = 0;
	// Block-stride initialization: each thread covers t, t+B, t+2B, ...
	// Using blockDim.x (instead of the THREADS_PER_BLOCK macro) keeps the
	// kernel correct for any block size, not just the current launch config.
	for (uint32_t tab_ind_start = t; tab_ind_start < array_size; tab_ind_start += blockDim.x)
	{
		myTab[tab_ind_start] = 1;
	}
	// All writes to myTab must be visible before thread 0 reads them back.
	__syncthreads();

	if (t == 0) {
		for (uint32_t ind = 0; ind < array_size; ind++)
		{
			mySum += myTab[ind];
		}

		// BUG FIX: mySum is uint32_t, so the matching specifier is %u.
		// "%lu" expects unsigned long (64-bit on Linux/macOS) and is
		// undefined behavior for a 32-bit argument.
		printf("Sum = %u\n", mySum);
	}

}

int main()
{
	// Turing (CC 7.5) allows up to 64 KB of dynamic shared memory per block,
	// but anything above the default 48 KB requires this explicit opt-in.
	// NOTE: the opt-in only works when the kernel is compiled for the real
	// architecture (e.g. nvcc -arch=sm_75). Building for the default/older
	// target is the classic cause of the >48 KB launch failing with
	// "invalid argument" even though this attribute call appears to be made.
	uint32_t maxbytes = 65536; // 64 KB
	cudaError_t error = cudaFuncSetAttribute(TestFunction, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
	printf("Cache error: %s\n", cudaGetErrorString(error));

	dim3  grid(BLOCKS_NB, 1, 1);
	dim3  threads(THREADS_PER_BLOCK, 1, 1);
	size_t mem_size = 49152;   // bytes of dynamic shared memory per block

	cudaDeviceProp prop;
	// BUG FIX: this call's return status was previously ignored.
	error = cudaGetDeviceProperties(&prop, 0);
	if (error != cudaSuccess) {
		printf("cudaGetDeviceProperties error: %s\n", cudaGetErrorString(error));
		return 1;
	}
	// BUG FIX: sharedMemPerBlock / sharedMemPerMultiprocessor /
	// sharedMemPerBlockOptin are size_t; %d is a format mismatch — use %zu.
	printf("sh/b = %zu\n", prop.sharedMemPerBlock);
	printf("sh/mp = %zu\n", prop.sharedMemPerMultiprocessor);
	// sharedMemPerBlockOptin is the limit that actually governs the >48 KB
	// opt-in (sharedMemPerBlock always reports the default 48 KB cap).
	printf("sh/b opt-in = %zu\n", prop.sharedMemPerBlockOptin);
	printf("L2 size = %d\n", prop.l2CacheSize);

	// execute the kernel
	TestFunction <<< grid, threads, mem_size >>> ();

	// BUG FIX: launch-configuration errors (e.g. too much shared memory)
	// are reported synchronously — check them right after the launch, then
	// synchronize to surface asynchronous execution errors separately.
	printf("Launch error: %s\n", cudaGetErrorString(cudaGetLastError()));
	cudaDeviceSynchronize();
	printf("CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));
	return 0;
}

Does anyone see a problem in my code or have an idea of what the problem is?

Thank you in advance.