One thread write into shared memory has limit?

202476410arsmart · July 27, 2022, 12:26pm

Hi! I am writing 64 values into shared memory in one thread, and 256 threads are doing the same thing into different shared memory location, generally will fill 256*64 values in shared memory. But I find out, some lines will randomly loss!??? Each time different!!! I promise I have used __syncthreads(). I am wondering whether any restriction here.

Thank you!!!

#include<iostream>
using namespace std;
#include "cuda_runtime.h"
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
__global__ void test()
{
	extern __shared__ __align__(16 * 1024) float smem[];
	float* smem_a = smem;

	int tx16 = threadIdx.x % 16;
	int ty16 = threadIdx.x / 16;

	float4 f4_rand = make_float4(0, 0, 0, 0);
	float4 c[8][2] = { { f4_rand } };
	{
	c[0][0].x = 1, c[0][0].y = 1, c[0][0].z = 1, c[0][0].w = 1, c[0][1].x = 1, c[0][1].y = 1, c[0][1].z = 1, c[0][1].w = 1, c[1][0].x = 1, c[1][0].y = 1, c[1][0].z = 1, c[1][0].w = 1, c[1][1].x = 1, c[1][1].y = 1, c[1][1].z = 1, c[1][1].w = 1, c[2][0].x = 1, c[2][0].y = 1, c[2][0].z = 1, c[2][0].w = 1, c[2][1].x = 1, c[2][1].y = 1, c[2][1].z = 1, c[2][1].w = 1, c[3][0].x = 1, c[3][0].y = 1, c[3][0].z = 1, c[3][0].w = 1, c[3][1].x = 1, c[3][1].y = 1, c[3][1].z = 1, c[3][1].w = 1, c[4][0].x = 1, c[4][0].y = 1, c[4][0].z = 1, c[4][0].w = 1, c[4][1].x = 1, c[4][1].y = 1, c[4][1].z = 1, c[4][1].w = 1, c[5][0].x = 1, c[5][0].y = 1, c[5][0].z = 1, c[5][0].w = 1, c[5][1].x = 1, c[5][1].y = 1, c[5][1].z = 1, c[5][1].w = 1, c[6][1].x = 1, c[6][1].y = 1, c[6][1].z = 1, c[6][1].w = 1, c[6][0].x = 1, c[6][0].y = 1, c[6][0].z = 1, c[6][0].w = 1, c[7][0].x = 1, c[7][0].y = 1, c[7][0].z = 1, c[7][0].w = 1, c[7][1].x = 1, c[7][1].y = 1, c[7][1].z = 1, c[7][1].w = 1;
	}  // All set to 1

	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*0]) = c[0][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128 * 1]) = c[0][1];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*2]) = c[1][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*3]) = c[1][1];

	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 0+64]) = c[4][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 1 + 64]) = c[4][1];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 2 + 64]) = c[5][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 3 + 64]) = c[5][1];
	__syncthreads();

	if (threadIdx.x == 0 && blockIdx.x == 0) {
		for (int ii = 0; ii < 128; ii++) {
			for (int jj = 0; jj < 128; jj++) {
				if (smem_a[ii * 128 + jj] != 0) {
					printf("result[%d][%d]=%f  ", ii, jj, smem_a[ii * 128 + jj]);
				}
			}
			printf("\n");
		}
		printf("\n");
	}
	__syncthreads();}
int main(){
	dim3 grid(1, 1);
	int maxbytes = 81 * 1024; // 81 KB
	cudaFuncSetAttribute(test, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
	test << <grid, 256, 81 * 1024 >> > ();}

I am using ampere structure, but no worry, if you are turing strucutre like 2070 or 1650, try this:

#include<iostream>
using namespace std;
#include "cuda_runtime.h"
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
__global__ void test()
{
	extern __shared__ __align__(16 * 1024) float smem[];
	float* smem_a = smem;

	int tx16 = threadIdx.x % 16;
	int ty16 = threadIdx.x / 16;

	float4 f4_rand = make_float4(0, 0, 0, 0);
	float4 c[8][2] = { { f4_rand } };
	{
	c[0][0].x = 1, c[0][0].y = 1, c[0][0].z = 1, c[0][0].w = 1, c[0][1].x = 1, c[0][1].y = 1, c[0][1].z = 1, c[0][1].w = 1, c[1][0].x = 1, c[1][0].y = 1, c[1][0].z = 1, c[1][0].w = 1, c[1][1].x = 1, c[1][1].y = 1, c[1][1].z = 1, c[1][1].w = 1, c[2][0].x = 1, c[2][0].y = 1, c[2][0].z = 1, c[2][0].w = 1, c[2][1].x = 1, c[2][1].y = 1, c[2][1].z = 1, c[2][1].w = 1, c[3][0].x = 1, c[3][0].y = 1, c[3][0].z = 1, c[3][0].w = 1, c[3][1].x = 1, c[3][1].y = 1, c[3][1].z = 1, c[3][1].w = 1, c[4][0].x = 1, c[4][0].y = 1, c[4][0].z = 1, c[4][0].w = 1, c[4][1].x = 1, c[4][1].y = 1, c[4][1].z = 1, c[4][1].w = 1, c[5][0].x = 1, c[5][0].y = 1, c[5][0].z = 1, c[5][0].w = 1, c[5][1].x = 1, c[5][1].y = 1, c[5][1].z = 1, c[5][1].w = 1, c[6][1].x = 1, c[6][1].y = 1, c[6][1].z = 1, c[6][1].w = 1, c[6][0].x = 1, c[6][0].y = 1, c[6][0].z = 1, c[6][0].w = 1, c[7][0].x = 1, c[7][0].y = 1, c[7][0].z = 1, c[7][0].w = 1, c[7][1].x = 1, c[7][1].y = 1, c[7][1].z = 1, c[7][1].w = 1;
	}

	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*0]) = c[0][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128 * 1]) = c[0][1];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*2]) = c[1][0];
	FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*3]) = c[1][1];

	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 0+64]) = c[4][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 1 + 64]) = c[4][1];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 2 + 64]) = c[5][0];
	FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 3 + 64]) = c[5][1];
	__syncthreads();

	if (threadIdx.x == 0 && blockIdx.x == 0) {
		for (int ii = 0; ii < 64; ii++) {
			for (int jj = 0; jj < 128; jj++) {
				if (smem_a[ii * 128 + jj] != 0) {
					printf("result[%d][%d]=%f  ", ii, jj, smem_a[ii * 128 + jj]);
				}
			}
			printf("\n");
		}
		printf("\n");
	}
	__syncthreads();}
int main(){
	dim3 grid(1, 1);
	int maxbytes = 64 * 1024; // 81 KB
	CHECK(cudaFuncSetAttribute(test, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes));
	test << <grid, 256, 64 * 1024 >> > ();
	cudaDeviceSynchronize();
}

202476410arsmart · July 27, 2022, 2:00pm

Well…I find out this is my 3050’s problem…In 2070 and A100, it is fine!!! Why this happen??? I know I do not have ECC, but 2070 also does not have! Why???I am very confused…

Robert_Crovella · July 27, 2022, 2:46pm

A 2070 is a cc7.5 device and does not support allocation of 81KB of shared memory per threadblock (the maximum is 64KB). Any time you are asking for help on a forum, I would encourage you to first deploy proper CUDA error checking.

Additionally, I consider this pattern problematic:

int main(){

  ...
  kernel<<<...>>>(...);
}

You should always use a cudaDeviceSynchronize() or equivalent blocking function (e.g. cudaMemcpy()) after a kernel launch, and before reaching the end of main. Yes, I understand on certain platforms it may appear to be unnecessary. I would not rely on that. Do as you wish, of course.

This is also confusing to me:

			if (smem_a[ii * 128 + jj] != 0) {
				printf("result[%d][%d]=%f  ", ii, jj, smem_a[ii * 128 + jj]);

It appears to me that you are setting shared memory float locations to 1.0. When I run your code on a cc7.0 device, I see lots of printout of 1.0 values. Why are you testing against zero? What does a printout of all 1.0 values tell us? I can’t draw any conclusions from it.

A further observation:

Each thread is writing 8 float4 locations in shared memory. You have 256 threads. So that is a maximum of 1024 float4 locations in shared memory that could possibly be set to 1 (without trying to diagnose if you have overlapping indexing). But you’re allocating 81*1024 = 81KBytes of shared memory, dividing that by 16 bytes per float4 location, we have 5184 total float4 locations available in your shared allocation. Therefore after your loading of shared memory, you will have a mix of ones and zeroes. Your doubly-nested for-loops that are checking the result are iterating over 64x128 = 8192 float locations or, alternatively, 2048 float4 locations. Since you are setting a maximum of 1024 float4 locations, and iterating over 2048 float4 locations, I’m not sure what you are trying to demonstrate, but expecting all ones or all zeroes is not sensible from what I can see here.

202476410arsmart · July 28, 2022, 3:39am

Thank you!
As I mentioned, I provided two versions, the upper for ampere and lower for turing, which all follow the shared memory limit.
Also, I am actually trying to save the calculation result in the shared memory, I set them all to one just for convenience…Not familiar with curand…
Maybe you can double check why 3050 can not print all the value? Maybe it’s my device’s production problem, but I believe more likely to be…some inner limit I do not know…I am attaching my deviceQuery below for your reference:

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce RTX 3050 Laptop GPU"
  CUDA Driver Version / Runtime Version          11.7 / 11.7
  CUDA Capability Major/Minor version number:    8.6
  Total amount of global memory:                 4096 MBytes (4294443008 bytes)
  GPU Max Clock rate:                            1740 MHz (1.74 GHz)
  Memory Clock rate:                             6001 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 1572864 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total shared memory per multiprocessor:        102400 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 5 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.7, CUDA Runtime Version = 11.7, NumDevs = 1

202476410arsmart · July 28, 2022, 4:12am

Someone suggest me it might be related to: I should use “graphic memory sync? cuda barrier?” but not “thread sync”. I do not understand…Do you know what does this mean? Thank you!!!

Topic		Replies	Views
threads and blocks CUDA Programming and Performance	3	1359	May 7, 2012
Number of kilobytes transferred to/from shared memory twice the expected CUDA Programming and Performance	12	704	September 29, 2018
Memory problem? ...incredible slowdown CUDA Programming and Performance	29	16308	January 30, 2011
Error: Failed to suspend device for CUDA device 0 CUDA Programming and Performance	8	4553	January 4, 2023
GTX460 Max of 1 Block? CUDA Programming and Performance	8	7171	July 3, 2011
2D reduction using CUDA The use a cuda and cublas library for a 2D simple reduction CUDA Programming and Performance	11	4440	February 7, 2012
Grid Settings (It may be associated with Multiprocessor) CUDA Programming and Performance	2	489	October 11, 2016
Cannot achieve max shared memory bandwith CUDA Programming and Performance	12	819	November 20, 2023
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	3047	November 13, 2017
Effective global memory bandwidth? CUDA Programming and Performance	17	17572	September 18, 2007

One thread write into shared memory has limit?

Related topics