I get random wrong results from the following simple matrix multiply; it multiplies matrices of bytes (uint8_t).
#include <cstdint>
#include <iostream>
#include <cuda_runtime.h>

#ifdef CUDA
__device__ __forceinline__
#endif
void MatrixMul_32X32( uint8_t* A, uint8_t* B, uint8_t* C )
{
    __shared__ uint8_t subA[32][32];
    __shared__ uint8_t subB[32][32];
    // The 32x32 thread block walks over the 256x256 output in 32x32 tiles.
    for( int i = 0; i < 256; i += 32 ) {
        for( int j = 0; j < 256; j += 32 ) {
            uint32_t sum = 0;
            // Accumulate the dot product over 32-wide tiles of A and B.
            for( int k = 0; k < 256; k += 32 ) {
                subA[ threadIdx.y ][ threadIdx.x ] = A[ 256 * ( i + threadIdx.y ) + k + threadIdx.x ];
                subB[ threadIdx.y ][ threadIdx.x ] = B[ 256 * ( k + threadIdx.y ) + j + threadIdx.x ];
                __syncthreads();
                for( int kk = 0; kk < 32; kk++ ) {
                    sum += subA[ threadIdx.y ][ kk ] * subB[ kk ][ threadIdx.x ];
                }
            }
            // Fold the 32-bit sum back down to one byte.
            C[ ( i + threadIdx.y ) * 256 + j + threadIdx.x ] = uint8_t( ( sum + ( sum >> 8 ) ) & 0xFF );
        }
    }
}
#ifdef CUDA
__global__
#endif
void MatrixMulKernel( uint8_t* A_MatArr, uint8_t* temp_MatArr )
{
    // Each block multiplies one pair of 256x256 matrices packed back to back in A_MatArr.
    uint8_t* MA = &A_MatArr[ 256 * 256 * blockIdx.x * 2 ];
    uint8_t* MB = &A_MatArr[ 256 * 256 * ( blockIdx.x * 2 + 1 ) ];
    uint8_t* MC = &temp_MatArr[ 256 * 256 * blockIdx.x ];
    //MatrixMul_128X128( MA, MB, MC );
    MatrixMul_32X32( MA, MB, MC );
}
void testMatrixMul( uint32_t length, cudaStream_t stream, uint8_t* A_MatArr, uint8_t* tmp_MatArr )
{
    // One 32x32 thread block per matrix pair.
    MatrixMulKernel<<< length, dim3( 32, 32 ), 0, stream >>>( A_MatArr, tmp_MatArr );
    cudaError_t e = cudaGetLastError();
    if( e != cudaSuccess ) {
        std::clog << "launch MatrixMul kernel error: " << cudaGetErrorName( e ) << std::endl;
    }
    cudaDeviceSynchronize();
}
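For completeness, a minimal host-side driver along these lines reproduces the setup (this is only a sketch, not my real code; runTest is just a name I use here, and the buffer sizes are what the indexing in MatrixMulKernel implies: length pairs of 256 x 256 input matrices and length output matrices):

// Sketch of a host-side driver; sizes are inferred from MatrixMulKernel's indexing.
void runTest( uint32_t length )
{
    const size_t matBytes = 256 * 256;              // one 256x256 uint8_t matrix
    uint8_t* dA = nullptr;
    uint8_t* dTmp = nullptr;
    cudaMalloc( (void**)&dA,   matBytes * 2 * length );  // 'length' pairs of input matrices
    cudaMalloc( (void**)&dTmp, matBytes * length );      // one output matrix per pair
    // ... fill dA with input data via cudaMemcpy ...
    cudaStream_t stream;
    cudaStreamCreate( &stream );
    testMatrixMul( length, stream, dA, dTmp );
    cudaStreamDestroy( stream );
    cudaFree( dA );
    cudaFree( dTmp );
}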
As you can see, I used __syncthreads() to synchronize all threads in the thread block. One thread block of 32 x 32 threads handles the multiplication of one 256 x 256 matrix pair:
MC[256][256] = MA[256][256] x MB[256][256]
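To make the expected result explicit: per element I want the row-by-column dot product folded back to one byte, i.e. the same computation a plain CPU loop does. MatrixMulRef_CPU below is only a reference sketch I use for checking, not part of the kernel code; it mirrors the kernel's reduction exactly:

// CPU reference: element (r, c) of C is the dot product of row r of A and
// column c of B, folded to one byte the same way the kernel does.
void MatrixMulRef_CPU( const uint8_t* A, const uint8_t* B, uint8_t* C )
{
    for( int r = 0; r < 256; r++ ) {
        for( int c = 0; c < 256; c++ ) {
            uint32_t sum = 0;
            for( int k = 0; k < 256; k++ ) {
                sum += uint32_t( A[ 256 * r + k ] ) * B[ 256 * k + c ];
            }
            C[ 256 * r + c ] = uint8_t( ( sum + ( sum >> 8 ) ) & 0xFF );
        }
    }
}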
I'm really getting frustrated with NVIDIA's products; I hope they fall one day.