Is there any bug with concurrent byte reads/writes in shared memory?

I get random wrong results from the following simple matrix multiply; it operates on bytes.

#ifdef CUDA
__device__ __forceinline__
#endif
void MatrixMul_32X32( uint8_t* A, uint8_t* B, uint8_t *C )
{
    __shared__ uint8_t subA[32][32];
    __shared__ uint8_t subB[32][32];

    for( int i = 0; i < 256; i += 32 ) {
        for( int j = 0; j < 256; j += 32 ) {
            uint32_t sum = 0;
            for( int k = 0; k < 256; k += 32 ) {
                // each thread loads one byte of each 32x32 tile into shared memory
                subA[ threadIdx.y ][ threadIdx.x ] = A[ 256 * ( i + threadIdx.y ) + k + threadIdx.x ];
                subB[ threadIdx.y ][ threadIdx.x ] = B[ 256 * ( k + threadIdx.y ) + j + threadIdx.x ];
                __syncthreads();
                // accumulate the 32-element partial dot product
                for( int kk = 0; kk < 32; kk++ ) {
                    sum += subA[ threadIdx.y ][ kk ] * subB[ kk ][ threadIdx.x ];
                }
            }
            // fold the 32-bit sum back down to one byte
            C[ ( i + threadIdx.y ) * 256 + j + threadIdx.x ] = uint8_t( ( sum + ( sum >> 8 ) ) & 0xFF );
        }
    }
}

#ifdef CUDA
__global__
#endif
void MatrixMulKernel( uint8_t* A_MatArr, uint8_t *temp_MatArr )
{
    // each block multiplies one pair of 256x256 matrices
    uint8_t *MA = &A_MatArr[ 256 * 256 * blockIdx.x * 2 ];
    uint8_t *MB = &A_MatArr[ 256 * 256 * ( blockIdx.x * 2 + 1 ) ];
    uint8_t *MC = &temp_MatArr[ 256 * 256 * blockIdx.x ];
    //MatrixMul_128X128 ( MA, MB, MC );
    MatrixMul_32X32 ( MA, MB, MC );
}

void testMatrixMul( uint32_t length, cudaStream_t stream, uint8_t * A_MatArr, uint8_t* tmp_MatArr )
{
    MatrixMulKernel<<< length, dim3( 32, 32 ), 0, stream >>>( A_MatArr, tmp_MatArr );
    cudaError_t e = cudaGetLastError();
    if( e != cudaSuccess ) {
        std::clog << "launch MatrixMul kernel error: " << cudaGetErrorName( e ) << std::endl;
    }
    cudaDeviceSynchronize();
}

As you can see, I used __syncthreads() to synchronize all the threads in the thread block; one 32 x 32 thread block handles the multiplication of one pair of 256 x 256 matrices:

MC[256][256] = MA[256][256] x MB[256][256]
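
For comparison, the tiled-multiply example in the CUDA C Programming Guide uses two barriers per tile: one after the loads, and one after the inner product so that no thread overwrites a tile another thread is still reading. Here is a minimal sketch of that pattern with my sizes (the helper name TiledMulRow_Sketch is made up for illustration, it is not from my code):

#ifdef CUDA
__device__ __forceinline__
#endif
void TiledMulRow_Sketch( uint8_t* A, uint8_t* B, int i, int j, uint32_t &sum )
{
    __shared__ uint8_t tA[32][32];
    __shared__ uint8_t tB[32][32];

    for( int k = 0; k < 256; k += 32 ) {
        tA[ threadIdx.y ][ threadIdx.x ] = A[ 256 * ( i + threadIdx.y ) + k + threadIdx.x ];
        tB[ threadIdx.y ][ threadIdx.x ] = B[ 256 * ( k + threadIdx.y ) + j + threadIdx.x ];
        __syncthreads();  // wait until both tiles are fully loaded
        for( int kk = 0; kk < 32; kk++ ) {
            sum += tA[ threadIdx.y ][ kk ] * tB[ kk ][ threadIdx.x ];
        }
        __syncthreads();  // wait until all reads finish before the tiles are overwritten
    }
}

My kernel above only has the first barrier, so perhaps that is related to what I am seeing.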

I get really frustrated by NVIDIA's products; I hope they fall one day.

Your wish has been granted. NVDA stock is falling.

I don’t seem to have any trouble with your code. I always get the correct results, no variability that I can see.

$ cat t330.cu
#include <iostream>
#include <stdint.h>
#define CUDA

#ifdef CUDA
__device__ __forceinline__
#endif
void MatrixMul_32X32( uint8_t* A, uint8_t* B, uint8_t *C )
{
  __shared__ uint8_t subA[32][32];
  __shared__ uint8_t subB[32][32];

  for( int i = 0; i < 256; i += 32 ) {
    for( int j = 0; j < 256; j += 32 ) {
      uint32_t sum = 0;
      for( int k = 0; k < 256; k += 32 ) {
        subA[threadIdx.y][ threadIdx.x] = A[ 256 * ( i + threadIdx.y ) + k + threadIdx.x ] ;
        subB[threadIdx.y][ threadIdx.x] = B[ 256 * ( k + threadIdx.y ) + j + threadIdx.x ] ;
        __syncthreads ();
        for( int kk = 0; kk < 32; kk++ ) {
          sum += subA[ threadIdx.y ][ kk ] * subB[ kk ][ threadIdx.x ];
          }
        }
      C[( i + threadIdx.y ) * 256 + j + threadIdx.x ] = uint8_t(( sum + ( sum >> 8 ) ) & 0xFF );
    }
  }
}


#ifdef CUDA
__global__
#endif
void MatrixMulKernel( uint8_t* A_MatArr, uint8_t *temp_MatArr )
{
  uint8_t *MA = &A_MatArr[ 256 * 256 * blockIdx.x * 2 ];
  uint8_t *MB = &A_MatArr[ 256 * 256 * ( blockIdx.x * 2 + 1 )];
  uint8_t *MC = &temp_MatArr[ 256 * 256 * blockIdx.x ] ;
//MatrixMul_128X128 ( MA, MB, MC );
  MatrixMul_32X32 ( MA, MB, MC );
}


void testMatrixMul( uint32_t length, cudaStream_t stream, uint8_t * A_MatArr, uint8_t* tmp_MatArr )
{
  MatrixMulKernel<<< length, dim3( 32, 32 ), 0, stream >>>( A_MatArr, tmp_MatArr );
  cudaError_t e = cudaGetLastError( );
  if( e != cudaSuccess ) {
    std::clog << "launch MatrixMul kernel error: " << cudaGetErrorName( e ) << std::endl ;
  }
  cudaDeviceSynchronize();
}

int main(){
  uint8_t *A_MatArr, *tmp_MatArr, *h_result;
  cudaMalloc(&A_MatArr, 512*256);    // room for one pair of 256x256 input matrices
  cudaMalloc(&tmp_MatArr, 256*256);  // one 256x256 result matrix
  cudaMemset(A_MatArr, 1, 512*256);  // set every input byte to 1
  cudaMemset(tmp_MatArr, 0, 256*256);
  h_result = new uint8_t[256*256];
  testMatrixMul( 1, 0, A_MatArr, tmp_MatArr );
  cudaMemcpy(h_result, tmp_MatArr, 256*256, cudaMemcpyDeviceToHost);
  // with all-ones inputs each dot product is 256, and (256 + (256 >> 8)) & 0xFF == 1
  for (int i = 0; i < 256*256; i++)
    if (h_result[i] != 1) {
      std::cout << "error at: " << i << " was: " << (int)h_result[i] << " should be 1" << std::endl;
      return -1;
    }
  return 0;
}
$ nvcc -o t330 t330.cu
$ cuda-memcheck ./t330
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t330
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t330
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t330
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t330
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
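
One more suggestion: plain cuda-memcheck runs only the memory access checker. Since your question is about the ordering of __shared__ byte accesses, you could also try the racecheck tool, which specifically looks for shared-memory hazards:

$ cuda-memcheck --tool racecheck ./t330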

I used the code from #3, and I don’t see any issues either. Windows 7, CUDA 8, Quadro P2000. Compiled and ran as follows:

C:\Users\Norbert\My Programs>nvcc -arch=sm_61 -o t330.exe t330.cu
nvcc warning : nvcc support for Microsoft Visual Studio 2010 and earlier has been deprecated and is no longer being maintained
t330.cu
support for Microsoft Visual Studio 2010 has been deprecated!
   Creating library t330.lib and object t330.exp

C:\Users\Norbert\My Programs>cuda-memcheck t330.exe
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

C:\Users\Norbert\My Programs>cuda-memcheck t330.exe
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

C:\Users\Norbert\My Programs>cuda-memcheck t330.exe
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

I thought of buying a few, but there is a KTM Adventure waiting for me next summer.