One thread write into shared memory has limit?

Hi! I am writing 64 values into shared memory in one thread, and 256 threads are doing the same thing into different shared memory location, generally will fill 256*64 values in shared memory. But I find out, some lines will randomly loss!??? Each time different!!! I promise I have used __syncthreads(). I am wondering whether any restriction here.

Thank you!!!

#include<iostream>
using namespace std;
#include "cuda_runtime.h"
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
__global__ void test()
{
	extern __shared__ __align__(16 * 1024) float smem[];
	float* smem_a = smem;

	int tx16 = threadIdx.x % 16;
	int ty16 = threadIdx.x / 16;

	float4 f4_rand = make_float4(0, 0, 0, 0);
	float4 c[8][2] = { { f4_rand } };
	{
	c[0][0].x = 1, c[0][0].y = 1, c[0][0].z = 1, c[0][0].w = 1, c[0][1].x = 1, c[0][1].y = 1, c[0][1].z = 1, c[0][1].w = 1, c[1][0].x = 1, c[1][0].y = 1, c[1][0].z = 1, c[1][0].w = 1, c[1][1].x = 1, c[1][1].y = 1, c[1][1].z = 1, c[1][1].w = 1, c[2][0].x = 1, c[2][0].y = 1, c[2][0].z = 1, c[2][0].w = 1, c[2][1].x = 1, c[2][1].y = 1, c[2][1].z = 1, c[2][1].w = 1, c[3][0].x = 1, c[3][0].y = 1, c[3][0].z = 1, c[3][0].w = 1, c[3][1].x = 1, c[3][1].y = 1, c[3][1].z = 1, c[3][1].w = 1, c[4][0].x = 1, c[4][0].y = 1, c[4][0].z = 1, c[4][0].w = 1, c[4][1].x = 1, c[4][1].y = 1, c[4][1].z = 1, c[4][1].w = 1, c[5][0].x = 1, c[5][0].y = 1, c[5][0].z = 1, c[5][0].w = 1, c[5][1].x = 1, c[5][1].y = 1, c[5][1].z = 1, c[5][1].w = 1, c[6][1].x = 1, c[6][1].y = 1, c[6][1].z = 1, c[6][1].w = 1, c[6][0].x = 1, c[6][0].y = 1, c[6][0].z = 1, c[6][0].w = 1, c[7][0].x = 1, c[7][0].y = 1, c[7][0].z = 1, c[7][0].w = 1, c[7][1].x = 1, c[7][1].y = 1, c[7][1].z = 1, c[7][1].w = 1;
	}  // All set to 1

	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*0]) = c[0][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128 * 1]) = c[0][1];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*2]) = c[1][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*3]) = c[1][1];

	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 0+64]) = c[4][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 1 + 64]) = c[4][1];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 2 + 64]) = c[5][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 3 + 64]) = c[5][1];
	__syncthreads();

	if (threadIdx.x == 0 && blockIdx.x == 0) {
		for (int ii = 0; ii < 128; ii++) {
			for (int jj = 0; jj < 128; jj++) {
				if (smem_a[ii * 128 + jj] != 0) {
					printf("result[%d][%d]=%f  ", ii, jj, smem_a[ii * 128 + jj]);
				}
			}
			printf("\n");
		}
		printf("\n");
	}
	__syncthreads();}
int main(){
	dim3 grid(1, 1);
	int maxbytes = 81 * 1024; // 81 KB
	cudaFuncSetAttribute(test, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
	test << <grid, 256, 81 * 1024 >> > ();}

I am using ampere structure, but no worry, if you are turing strucutre like 2070 or 1650, try this:

#include<iostream>
using namespace std;
#include "cuda_runtime.h"
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
__global__ void test()
{
	extern __shared__ __align__(16 * 1024) float smem[];
	float* smem_a = smem;

	int tx16 = threadIdx.x % 16;
	int ty16 = threadIdx.x / 16;

	float4 f4_rand = make_float4(0, 0, 0, 0);
	float4 c[8][2] = { { f4_rand } };
	{
	c[0][0].x = 1, c[0][0].y = 1, c[0][0].z = 1, c[0][0].w = 1, c[0][1].x = 1, c[0][1].y = 1, c[0][1].z = 1, c[0][1].w = 1, c[1][0].x = 1, c[1][0].y = 1, c[1][0].z = 1, c[1][0].w = 1, c[1][1].x = 1, c[1][1].y = 1, c[1][1].z = 1, c[1][1].w = 1, c[2][0].x = 1, c[2][0].y = 1, c[2][0].z = 1, c[2][0].w = 1, c[2][1].x = 1, c[2][1].y = 1, c[2][1].z = 1, c[2][1].w = 1, c[3][0].x = 1, c[3][0].y = 1, c[3][0].z = 1, c[3][0].w = 1, c[3][1].x = 1, c[3][1].y = 1, c[3][1].z = 1, c[3][1].w = 1, c[4][0].x = 1, c[4][0].y = 1, c[4][0].z = 1, c[4][0].w = 1, c[4][1].x = 1, c[4][1].y = 1, c[4][1].z = 1, c[4][1].w = 1, c[5][0].x = 1, c[5][0].y = 1, c[5][0].z = 1, c[5][0].w = 1, c[5][1].x = 1, c[5][1].y = 1, c[5][1].z = 1, c[5][1].w = 1, c[6][1].x = 1, c[6][1].y = 1, c[6][1].z = 1, c[6][1].w = 1, c[6][0].x = 1, c[6][0].y = 1, c[6][0].z = 1, c[6][0].w = 1, c[7][0].x = 1, c[7][0].y = 1, c[7][0].z = 1, c[7][0].w = 1, c[7][1].x = 1, c[7][1].y = 1, c[7][1].z = 1, c[7][1].w = 1;
	}

	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*0]) = c[0][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128 * 1]) = c[0][1];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*2]) = c[1][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*3]) = c[1][1];

	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 0+64]) = c[4][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 1 + 64]) = c[4][1];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 2 + 64]) = c[5][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 3 + 64]) = c[5][1];
	__syncthreads();

	if (threadIdx.x == 0 && blockIdx.x == 0) {
		for (int ii = 0; ii < 64; ii++) {
			for (int jj = 0; jj < 128; jj++) {
				if (smem_a[ii * 128 + jj] != 0) {
					printf("result[%d][%d]=%f  ", ii, jj, smem_a[ii * 128 + jj]);
				}
			}
			printf("\n");
		}
		printf("\n");
	}
	__syncthreads();}
int main(){
	dim3 grid(1, 1);
	int maxbytes = 64 * 1024; // 81 KB
	CHECK(cudaFuncSetAttribute(test, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes));
	test << <grid, 256, 64 * 1024 >> > ();
	cudaDeviceSynchronize();
}

Well…I find out this is my 3050’s problem…In 2070 and A100, it is fine!!! Why this happen??? I know I do not have ECC, but 2070 also does not have! Why???I am very confused…

A 2070 is a cc7.5 device and does not support allocation of 81KB of shared memory per threadblock (the maximum is 64KB). Any time you are asking for help on a forum, I would encourage you to first deploy proper CUDA error checking.

Additionally, I consider this pattern problematic:

int main(){

  ...
  kernel<<<...>>>(...);
}

You should always use a cudaDeviceSynchronize() or equivalent blocking function (e.g. cudaMemcpy()) after a kernel launch, and before reaching the end of main. Yes, I understand on certain platforms it may appear to be unnecessary. I would not rely on that. Do as you wish, of course.

This is also confusing to me:

			if (smem_a[ii * 128 + jj] != 0) {
				printf("result[%d][%d]=%f  ", ii, jj, smem_a[ii * 128 + jj]);

It appears to me that you are setting shared memory float locations to 1.0. When I run your code on a cc7.0 device, I see lots of printout of 1.0 values. Why are you testing against zero? What does a printout of all 1.0 values tell us? I can’t draw any conclusions from it.

A further observation:

Each thread is writing 8 float4 locations in shared memory. You have 256 threads. So that is a maximum of 1024 float4 locations in shared memory that could possibly be set to 1 (without trying to diagnose if you have overlapping indexing). But you’re allocating 81*1024 = 81KBytes of shared memory, dividing that by 16 bytes per float4 location, we have 5184 total float4 locations available in your shared allocation. Therefore after your loading of shared memory, you will have a mix of ones and zeroes. Your doubly-nested for-loops that are checking the result are iterating over 64x128 = 8192 float locations or, alternatively, 2048 float4 locations. Since you are setting a maximum of 1024 float4 locations, and iterating over 2048 float4 locations, I’m not sure what you are trying to demonstrate, but expecting all ones or all zeroes is not sensible from what I can see here.

Thank you!
As I mentioned, I provided two versions, the upper for ampere and lower for turing, which all follow the shared memory limit.
Also, I am actually trying to save the calculation result in the shared memory, I set them all to one just for convenience…Not familiar with curand…
Maybe you can double check why 3050 can not print all the value? Maybe it’s my device’s production problem, but I believe more likely to be…some inner limit I do not know…I am attaching my deviceQuery below for your reference:

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce RTX 3050 Laptop GPU"
  CUDA Driver Version / Runtime Version          11.7 / 11.7
  CUDA Capability Major/Minor version number:    8.6
  Total amount of global memory:                 4096 MBytes (4294443008 bytes)
  GPU Max Clock rate:                            1740 MHz (1.74 GHz)
  Memory Clock rate:                             6001 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 1572864 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total shared memory per multiprocessor:        102400 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 5 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.7, CUDA Runtime Version = 11.7, NumDevs = 1

Someone suggest me it might be related to: I should use “graphic memory sync? cuda barrier?” but not “thread sync”. I do not understand…Do you know what does this mean? Thank you!!!