Is it possible to "shuffle" multiply two matrices without stride information?

The following source code requires an extra piece of information, called stride, in its Matrix struct.

Is it possible to rewrite it without using stride?

# cat t15.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL


__host__ __device__ void printShMatrix(float *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      printf("%.2f ", mat[i*cc+j]);
    }
    printf("\n");
  }
  printf("\n");
}

unsigned long long dtime_usec(unsigned long long start=0){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#define RNG 3
// Thread block size
#define BLOCK_SIZE 4
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
 __device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width    = BLOCK_SIZE;
    Asub.height   = BLOCK_SIZE;
    Asub.stride   = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                         + BLOCK_SIZE * col];
    return Asub;
}

// Matrix multiplication kernel called by MatMul()
 __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;
    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();
        if ((blockRow == 0) && (blockCol == 0) && (row == 0) && (col == 0)) printShMatrix(As[0], BLOCK_SIZE, BLOCK_SIZE);
        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}


 __global__ void MatMulKernel_shflA(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;
    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        // Shared memory used to store Bsub; each thread keeps its
        // element of Asub in a register (my_A) instead
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        float my_A = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();
        // Multiply Asub and Bsub together; the width argument (BLOCK_SIZE)
        // makes lane e relative to this thread's group of BLOCK_SIZE lanes,
        // so each thread reads my_A from the thread that holds As[row][e]
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += __shfl_sync(0xFFFFFFFF, my_A, e, BLOCK_SIZE) * Bs[e][col];
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size,
    cudaMemcpyHostToDevice);
    Matrix h_C_shfl;
    h_C_shfl.width = h_C_shfl.stride = C.width; h_C_shfl.height = C.height;
    h_C_shfl.elements = new float[h_C_shfl.width*h_C_shfl.height];
    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C); // warm-up
    cudaDeviceSynchronize();
    unsigned long long dt = dtime_usec(0);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "shared kernel time:  " << dt << "us" << std::endl;
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
#if 0
    MatMulKernel_shflA<<<dimGrid, dimBlock>>>(d_A, d_B, d_C); // warm-up
    cudaDeviceSynchronize();
    dt = dtime_usec(0);
    MatMulKernel_shflA<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "shuffle kernel time: " << dt << "us" << std::endl;
    // Read C from device memory
    cudaMemcpy(h_C_shfl.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < h_C_shfl.width*h_C_shfl.height; i++)
      if (h_C_shfl.elements[i] != C.elements[i]) {
        std::cout << "mismatch at: " << i << " should be: " << C.elements[i]
                  << " was: " << h_C_shfl.elements[i] << std::endl;
        return;}
#endif
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

int main(int argc, char *argv[]){

  Matrix h_A, h_B, h_C;
  int dim = 4*BLOCK_SIZE;
  if (argc > 1) dim = atoi(argv[1])*BLOCK_SIZE;
  h_A.width = h_A.stride = h_A.height = dim;
  h_B.width = h_B.stride = h_B.height = dim;
  h_C.width = h_C.stride = h_C.height = dim;
  h_A.elements = new float[dim*dim];
  h_B.elements = new float[dim*dim];
  h_C.elements = new float[dim*dim];
  for (int i = 0; i < dim*dim; i++) {
    h_A.elements[i] = rand()%RNG;
    h_B.elements[i] = rand()%RNG;}
  MatMul(h_A, h_B, h_C);
}
# nvcc -o t15 t15.cu
# ./t15
1.00 0.00 2.00 1.00
0.00 2.00 1.00 1.00
2.00 1.00 1.00 0.00
1.00 0.00 0.00 2.00

0.00 2.00 2.00 2.00
0.00 0.00 2.00 2.00
0.00 2.00 2.00 0.00
2.00 2.00 0.00 1.00

0.00 1.00 2.00 0.00
2.00 2.00 1.00 2.00
1.00 0.00 2.00 0.00
1.00 2.00 0.00 1.00

2.00 2.00 1.00 2.00
2.00 1.00 0.00 2.00
2.00 0.00 2.00 1.00
1.00 0.00 0.00 0.00

1.00 0.00 2.00 1.00
0.00 2.00 1.00 1.00
2.00 1.00 1.00 0.00
1.00 0.00 0.00 2.00

0.00 2.00 2.00 2.00
0.00 0.00 2.00 2.00
0.00 2.00 2.00 0.00
2.00 2.00 0.00 1.00

0.00 1.00 2.00 0.00
2.00 2.00 1.00 2.00
1.00 0.00 2.00 0.00
1.00 2.00 0.00 1.00

2.00 2.00 1.00 2.00
2.00 1.00 0.00 2.00
2.00 0.00 2.00 1.00
1.00 0.00 0.00 0.00

shared kernel time:  1557us
#

Any time you work with sub-matrices that are part of a larger matrix, a parameter like stride is needed to correctly compute the addresses of sub-matrix elements: consecutive rows (or columns) of the sub-matrix are separated in memory by a full row (or column) of the parent matrix, not by the sub-matrix's own extent. Depending on the storage convention, the value of such a stride parameter, which goes by various other names, is either the width of the larger matrix (for row-major storage) or its height (for column-major storage).
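
As a minimal sketch of why that is (GetTileElement and parentWidth are hypothetical names, not part of the posted code), consider addressing an element of a BLOCK_SIZE x BLOCK_SIZE tile embedded in a larger row-major matrix:

// Hypothetical helper: read element (row, col) of a tile whose upper-left
// corner is at 'tile', inside a row-major parent matrix that is
// parentWidth elements wide.
__device__ float GetTileElement(const float *tile, int row, int col, int parentWidth)
{
    // Consecutive rows of the tile are parentWidth elements apart in memory,
    // not BLOCK_SIZE elements apart -- that distance is exactly the stride.
    return tile[row * parentWidth + col];
    // Writing tile[row * BLOCK_SIZE + col] instead would be correct only if
    // the tile were stored contiguously in its own buffer, which is what the
    // kernels above achieve by first copying the tile into As/Bs in shared memory.
}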

cuBLAS uses the column-major storage convention, and in GEMM, for example, these parameters are called lda, ldb, and ldc (corresponding to the matrices A, B, and C), where ld stands for "leading dimension". The posted code uses row-major storage and uses A.stride, B.stride, and C.stride in an analogous way.
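
For comparison, here is a rough sketch of the same multiplication through cuBLAS, where the leading-dimension arguments play the role of stride; handle, m, n, k, and the device pointers dA, dB, dC are placeholders, and each matrix is assumed to be stored contiguously, so its leading dimension is simply its own row count:

#include <cublas_v2.h>

// C (m x n) = A (m x k) * B (k x n), all column-major
void gemm_sketch(cublasHandle_t handle, int m, int n, int k,
                 const float *dA, const float *dB, float *dC)
{
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                m, n, k,
                &alpha,
                dA, m,   // lda: for a sub-matrix this would be the parent's height
                dB, k,   // ldb
                &beta,
                dC, m);  // ldc
}

Passing a leading dimension larger than the sub-matrix's own extent is what lets GEMM operate directly on a tile of a bigger matrix without copying it, which is exactly the role stride plays in the posted kernels.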