Hi! I am writing 64 values into shared memory in one thread, and 256 threads are doing the same thing into different shared memory location, generally will fill 256*64 values in shared memory. But I find out, some lines will randomly loss!??? Each time different!!! I promise I have used __syncthreads(). I am wondering whether any restriction here.
Thank you!!!
#include<iostream>
using namespace std;
#include "cuda_runtime.h"
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
__global__ void test()
{
extern __shared__ __align__(16 * 1024) float smem[];
float* smem_a = smem;
int tx16 = threadIdx.x % 16;
int ty16 = threadIdx.x / 16;
float4 f4_rand = make_float4(0, 0, 0, 0);
float4 c[8][2] = { { f4_rand } };
{
c[0][0].x = 1, c[0][0].y = 1, c[0][0].z = 1, c[0][0].w = 1, c[0][1].x = 1, c[0][1].y = 1, c[0][1].z = 1, c[0][1].w = 1, c[1][0].x = 1, c[1][0].y = 1, c[1][0].z = 1, c[1][0].w = 1, c[1][1].x = 1, c[1][1].y = 1, c[1][1].z = 1, c[1][1].w = 1, c[2][0].x = 1, c[2][0].y = 1, c[2][0].z = 1, c[2][0].w = 1, c[2][1].x = 1, c[2][1].y = 1, c[2][1].z = 1, c[2][1].w = 1, c[3][0].x = 1, c[3][0].y = 1, c[3][0].z = 1, c[3][0].w = 1, c[3][1].x = 1, c[3][1].y = 1, c[3][1].z = 1, c[3][1].w = 1, c[4][0].x = 1, c[4][0].y = 1, c[4][0].z = 1, c[4][0].w = 1, c[4][1].x = 1, c[4][1].y = 1, c[4][1].z = 1, c[4][1].w = 1, c[5][0].x = 1, c[5][0].y = 1, c[5][0].z = 1, c[5][0].w = 1, c[5][1].x = 1, c[5][1].y = 1, c[5][1].z = 1, c[5][1].w = 1, c[6][1].x = 1, c[6][1].y = 1, c[6][1].z = 1, c[6][1].w = 1, c[6][0].x = 1, c[6][0].y = 1, c[6][0].z = 1, c[6][0].w = 1, c[7][0].x = 1, c[7][0].y = 1, c[7][0].z = 1, c[7][0].w = 1, c[7][1].x = 1, c[7][1].y = 1, c[7][1].z = 1, c[7][1].w = 1;
} // All set to 1
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*0]) = c[0][0];
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128 * 1]) = c[0][1];
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*2]) = c[1][0];
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*3]) = c[1][1];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 0+64]) = c[4][0];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 1 + 64]) = c[4][1];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 2 + 64]) = c[5][0];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 3 + 64]) = c[5][1];
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (int ii = 0; ii < 128; ii++) {
for (int jj = 0; jj < 128; jj++) {
if (smem_a[ii * 128 + jj] != 0) {
printf("result[%d][%d]=%f ", ii, jj, smem_a[ii * 128 + jj]);
}
}
printf("\n");
}
printf("\n");
}
__syncthreads();}
int main(){
dim3 grid(1, 1);
int maxbytes = 81 * 1024; // 81 KB
cudaFuncSetAttribute(test, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
test << <grid, 256, 81 * 1024 >> > ();}
I am using ampere structure, but no worry, if you are turing strucutre like 2070 or 1650, try this:
#include<iostream>
using namespace std;
#include "cuda_runtime.h"
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
__global__ void test()
{
extern __shared__ __align__(16 * 1024) float smem[];
float* smem_a = smem;
int tx16 = threadIdx.x % 16;
int ty16 = threadIdx.x / 16;
float4 f4_rand = make_float4(0, 0, 0, 0);
float4 c[8][2] = { { f4_rand } };
{
c[0][0].x = 1, c[0][0].y = 1, c[0][0].z = 1, c[0][0].w = 1, c[0][1].x = 1, c[0][1].y = 1, c[0][1].z = 1, c[0][1].w = 1, c[1][0].x = 1, c[1][0].y = 1, c[1][0].z = 1, c[1][0].w = 1, c[1][1].x = 1, c[1][1].y = 1, c[1][1].z = 1, c[1][1].w = 1, c[2][0].x = 1, c[2][0].y = 1, c[2][0].z = 1, c[2][0].w = 1, c[2][1].x = 1, c[2][1].y = 1, c[2][1].z = 1, c[2][1].w = 1, c[3][0].x = 1, c[3][0].y = 1, c[3][0].z = 1, c[3][0].w = 1, c[3][1].x = 1, c[3][1].y = 1, c[3][1].z = 1, c[3][1].w = 1, c[4][0].x = 1, c[4][0].y = 1, c[4][0].z = 1, c[4][0].w = 1, c[4][1].x = 1, c[4][1].y = 1, c[4][1].z = 1, c[4][1].w = 1, c[5][0].x = 1, c[5][0].y = 1, c[5][0].z = 1, c[5][0].w = 1, c[5][1].x = 1, c[5][1].y = 1, c[5][1].z = 1, c[5][1].w = 1, c[6][1].x = 1, c[6][1].y = 1, c[6][1].z = 1, c[6][1].w = 1, c[6][0].x = 1, c[6][0].y = 1, c[6][0].z = 1, c[6][0].w = 1, c[7][0].x = 1, c[7][0].y = 1, c[7][0].z = 1, c[7][0].w = 1, c[7][1].x = 1, c[7][1].y = 1, c[7][1].z = 1, c[7][1].w = 1;
}
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*0]) = c[0][0];
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128 * 1]) = c[0][1];
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*2]) = c[1][0];
FETCH_FLOAT4(smem_a[tx16 * 4*128 + ty16 * 4 + 128*3]) = c[1][1];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 0+64]) = c[4][0];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 1 + 64]) = c[4][1];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 2 + 64]) = c[5][0];
FETCH_FLOAT4(smem_a[tx16 * 4 * 128 + ty16 * 4 + 128 * 3 + 64]) = c[5][1];
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (int ii = 0; ii < 64; ii++) {
for (int jj = 0; jj < 128; jj++) {
if (smem_a[ii * 128 + jj] != 0) {
printf("result[%d][%d]=%f ", ii, jj, smem_a[ii * 128 + jj]);
}
}
printf("\n");
}
printf("\n");
}
__syncthreads();}
int main(){
dim3 grid(1, 1);
int maxbytes = 64 * 1024; // 81 KB
CHECK(cudaFuncSetAttribute(test, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes));
test << <grid, 256, 64 * 1024 >> > ();
cudaDeviceSynchronize();
}