Matrix multiplication performance issue

I used several methods to optimize matrix multiplication,
including naive, shared memory, a reduced-bank-conflict strategy, and so on.
What confuses me is the performance measurement.

And here is the Nsight Compute analysis.

You’ll find that the naive method apparently performs better than the optimized versions.

I don’t know why it turns out like that…

Every output has been compared to the standard answer (generated by a CPU method).

I will post my code below.


My code is like this…
If you need more code segments, I will post more.

full code here


__global__ void multiply_naive(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;  // used here as the row index
    int ty = blockIdx.y * blockDim.y + threadIdx.y;  // used here as the column index
    DTYPE sum = 0;
    for (int k = 0; k < MATRIX_W; k++)
        sum += A[tx * MATRIX_W + k] * B[k * MATRIX_W + ty];
    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_block(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += A[tx * MATRIX_W + (m + k)] * B[(m + k) * MATRIX_W + ty];

    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_shared(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ DTYPE As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ DTYPE Bs[BLOCK_SIZE][BLOCK_SIZE];

    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
    {
        As[threadIdx.x][threadIdx.y] = A[tx * MATRIX_W + (m + threadIdx.y)];
        Bs[threadIdx.x][threadIdx.y] = B[(m + threadIdx.x) * MATRIX_W + ty];

        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += As[threadIdx.x][k] * Bs[k][threadIdx.y];

        __syncthreads();
    }

    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_less_conflict(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ DTYPE As[BLOCK_SIZE][BLOCK_SIZE + 1];  // +1 column of padding to avoid shared memory bank conflicts
    __shared__ DTYPE Bs[BLOCK_SIZE][BLOCK_SIZE + 1];

    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
    {
        As[threadIdx.y][threadIdx.x] = A[tx * MATRIX_W + (m + threadIdx.y)];
        Bs[threadIdx.x][threadIdx.y] = B[(m + threadIdx.x) * MATRIX_W + ty];

        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += As[k][threadIdx.x] * Bs[k][threadIdx.y];

        __syncthreads();
    }

    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_unroll(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ DTYPE As[BLOCK_SIZE][BLOCK_SIZE + 1];
    __shared__ DTYPE Bs[BLOCK_SIZE][BLOCK_SIZE + 1];

    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
    {
        As[threadIdx.y][threadIdx.x] = A[tx * MATRIX_W + (m + threadIdx.y)];
        Bs[threadIdx.x][threadIdx.y] = B[(m + threadIdx.x) * MATRIX_W + ty];

        __syncthreads();

        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += As[k][threadIdx.x] * Bs[k][threadIdx.y];

        __syncthreads();
    }

    C[tx * MATRIX_W + ty] = sum;
}


const int MATRIX_H = 1024;
const int MATRIX_W = 1024;
const int BLOCK_SIZE = 32;

Matrix a, b, res, tmp;
DTYPE *d_a, *d_b, *d_tmp;

int gpu_check(const char* method, void (*kernal)(const DTYPE*, const DTYPE*, DTYPE*),
              int loop = 7)
{
    GpuTimer timer;
    std::vector<int> time_rec;

    for (int i = 1; i <= loop; i++)
    {
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(MATRIX_H / BLOCK_SIZE, MATRIX_W / BLOCK_SIZE);

        CHECK(cudaMemset(d_tmp, 0, MATRIX_H * MATRIX_W * sizeof(DTYPE)));

        timer.Start();
        kernal<<<grid, block>>>(d_a, d_b, d_tmp);
        CHECK(cudaDeviceSynchronize());
        timer.Stop();

        CHECK(cudaMemcpy(tmp.a, d_tmp, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyDeviceToHost));

        int time_elapsed = static_cast<int>(timer.Elapsed());
        printf("%s: %d ms\n", method, time_elapsed);

        if (!check_mat(res, tmp))
        {
            printf("Not correct result! method: %s\n", method);
            throw "wrong result";
        }
        time_rec.emplace_back(time_elapsed);
    }

    std::sort(time_rec.begin(), time_rec.end());
    return time_rec[(time_rec.size() - 1) >> 1];  // median of the recorded times (reported as "average" in main)
}

void Init()
{
    srand(time(NULL));
    a.generate();
    b.generate();
    Multiply(a, b, res);

    CHECK(cudaMalloc((void **)&d_a, MATRIX_H * MATRIX_W * sizeof(DTYPE)));
    CHECK(cudaMalloc((void **)&d_b, MATRIX_H * MATRIX_W * sizeof(DTYPE)));
    CHECK(cudaMalloc((void **)&d_tmp, MATRIX_H * MATRIX_W * sizeof(DTYPE)));
    CHECK(cudaMemcpy(d_a, a.a, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, b.a, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyHostToDevice));
}

void Return()
{
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_tmp);
}

int main()
{
    Init();

    printf("GPU version:\n");
    printf("average: %d ms\n\n", gpu_check("Naive", multiply_naive));
    printf("average: %d ms\n\n", gpu_check("Block", multiply_block));
    printf("average: %d ms\n\n", gpu_check("Shared memory", multiply_shared));
    printf("average: %d ms\n\n", gpu_check("less conflict", multiply_less_conflict));
    printf("average: %d ms\n\n", gpu_check("Unroll", multiply_unroll));

    Return();
    return 0;
}

Please don’t paste pictures of code on this forum. Use the forum tools to properly format your code as text:

1. Paste the code into an edit window.
2. Select the code.
3. Press the </> button at the top of the edit window.
4. Save your changes.

Please fix it now.
I also recommend posting the complete code, not just the kernels.

A common error people make in these situations is to profile code that has been built as a debug project (e.g. on Windows) or with the debug compilation switch (-G) for device code.

If you are doing that, it may be the explanation. If you are interested in performance analysis, you should not be compiling debug device code.
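
For reference, the difference is just the build flags; the file and binary names here are illustrative. -G builds debug device code with most device optimizations disabled, while -lineinfo keeps source correlation for Nsight Compute without the performance penalty:

nvcc -G -o matmul_debug gpu_main.cu
nvcc -O3 -lineinfo -o matmul gpu_main.cu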

Thank you for your reply!

Turning off the -G option really works as expected.

Really sorry for the trouble caused by pasting pictures of code; I will post my code as text next time.

Thank you again for your genuine advice!

You also have a common pattern of using tx to select the matrix row and ty to select the matrix column, leading to element indexing calculations such as tx * MATRIX_W + ty.

This is generally not conducive to good memory access patterns across the warp, so refactoring your code to use the alternate pattern ty * MATRIX_W + tx (i.e. use ty to select the matrix row and tx to select the matrix column) may result in generally better performance.
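
As a minimal sketch of that refactor (reusing your DTYPE and MATRIX_W; not yet the programming-guide version mentioned below), the naive kernel would become:

__global__ void multiply_naive_coalesced(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;  // tx now selects the column
    int ty = blockIdx.y * blockDim.y + threadIdx.y;  // ty now selects the row
    DTYPE sum = 0;
    for (int k = 0; k < MATRIX_W; k++)
        sum += A[ty * MATRIX_W + k] * B[k * MATRIX_W + tx];  // the B read is coalesced across the warp
    C[ty * MATRIX_W + tx] = sum;  // the C write is also coalesced
}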

More efficient “naive” and “shared” matrix multiply codes can be found in the programming guide. For best performance, the usual suggestion would be to use a high-quality library such as CUBLAS.

You mean memory coalescing, right? That’s really good advice.

In fact, I planned to write that in a new function and make a comparison… but in the end I forgot.

By the way, I found that cublas performs a little worse in accuracy, which means that when I verify the result, a relative error tolerance of about 1e-5 is needed.

You may be making an error again. There’s no way to tell from the code you have posted. I don’t know of any reason why cublas would have systematically worse accuracy than, for example, your naive test case or shared memory test case.

I can’t tell what your DTYPE is, but the 1e-5 error bound seems possibly typical of using float datatype. If your DTYPE is actually double then that would not be a valid comparison.

If you assume that the values (products) you add up as sum have roughly the same exponent, then a better addition scheme is a tree instead of a loop.

It is called pairwise or cascaded summation:
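
A minimal recursive host-side sketch (reusing your DTYPE; the function name is mine):

DTYPE pairwise_sum(const DTYPE* v, int n)
{
    // add the two halves of the range separately, then combine;
    // rounding error grows like O(log n) rather than O(n) for a left-to-right loop
    if (n == 1) return v[0];
    int half = n / 2;
    return pairwise_sum(v, half) + pairwise_sum(v + half, n - half);
}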

DTYPE is actually float, and here are the cublas function and result.

void multiply_cublas(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    // cublas assumes column-major storage; a row-major matrix reinterpreted
    // as column-major is its transpose. Swapping the operands computes
    // C^T = B^T * A^T in column-major terms, which is C = A * B when the
    // result buffer is read back as row-major.
    const DTYPE alpha = 1, beta = 0;
    cublasSgemm(
        handle,
        CUBLAS_OP_N, CUBLAS_OP_N,
        MATRIX_H, MATRIX_W, MATRIX_H,
        &alpha,
        B, MATRIX_H,
        A, MATRIX_W,
        &beta,
        C, MATRIX_H
    );

    cublasDestroy(handle);  // creating/destroying the handle inside the timed call adds overhead
}
int gpu_check(const char* method, void (*kernal)(const DTYPE*, const DTYPE*, DTYPE*),
              int loop = 1)
{
    GpuTimer timer;
    std::vector<int> time_rec;

    for (int i = 1; i <= loop; i++)
    {
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(MATRIX_H / BLOCK_SIZE, MATRIX_W / BLOCK_SIZE);

        CHECK(cudaMemset(d_tmp, 0, MATRIX_H * MATRIX_W * sizeof(DTYPE)));

        timer.Start();
        kernal<<<grid, block>>>(d_a, d_b, d_tmp);
        CHECK(cudaDeviceSynchronize());
        timer.Stop();

        CHECK(cudaMemcpy(tmp.a, d_tmp, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyDeviceToHost));

        int time_elapsed = static_cast<int>(timer.Elapsed());
        printf("%s: %d ms\n", method, time_elapsed);

        // if (!check_mat(res, tmp))
        // {
        //     printf("Not correct result! method: %s\n", method);
        //     throw "wrong result";
        // }
        // print a 5x5 corner of the CPU reference (res) vs the GPU result (tmp)
        for (int r = 0; r < 5; r++)
            for (int c = 0; c < 5; c++)
                printf("(%d, %d) %f, %f\n", r, c, res[r][c], tmp[r][c]);
        time_rec.emplace_back(time_elapsed);
    }

    std::sort(time_rec.begin(), time_rec.end());
    return time_rec[(time_rec.size() - 1) >> 1];
}
GPU version:
cublas: 39 ms
(0, 0) 270538880.000000, 270539104.000000
(0, 1) 267656240.000000, 267656128.000000
(0, 2) 272429184.000000, 272429152.000000
(0, 3) 269264896.000000, 269264832.000000
(0, 4) 264540864.000000, 264541088.000000
(1, 0) 262382592.000000, 262382816.000000
(1, 1) 271015744.000000, 271015744.000000
(1, 2) 260825536.000000, 260825408.000000
(1, 3) 261286400.000000, 261286240.000000
(1, 4) 259316672.000000, 259316784.000000
(2, 0) 272900736.000000, 272900800.000000
(2, 1) 272983456.000000, 272983520.000000
(2, 2) 269640256.000000, 269640000.000000
(2, 3) 273878208.000000, 273878208.000000
(2, 4) 272137504.000000, 272137664.000000
(3, 0) 271197664.000000, 271197824.000000
(3, 1) 273098336.000000, 273098304.000000
(3, 2) 268501184.000000, 268501120.000000
(3, 3) 263617552.000000, 263617536.000000
(3, 4) 266090336.000000, 266090256.000000
(4, 0) 262191408.000000, 262191392.000000
(4, 1) 264995696.000000, 264995712.000000
(4, 2) 262589776.000000, 262590016.000000
(4, 3) 261433888.000000, 261433920.000000
(4, 4) 258600400.000000, 258600416.000000

In addition, here is the Matrix type definition.

class Matrix
{
public:
    Matrix() = default;

    DTYPE a[MATRIX_H][MATRIX_W];

    DTYPE* operator [](const int& x)
    {
        return a[x];
    }

    const DTYPE* operator [](const int& x) const
    {
        return a[x];
    }

    void generate();
    void init();
};

You will find that the relative error exists. I don’t know whether it’s my fault or cublas’s.

A relative error does not mean cublas is the issue. The cublas result might be more accurate than the other. And you still haven’t provided complete code that I could compile, run, and test.

Good luck!

Alright, I thought about posting my complete code here, but several files may take up too much space…
I just wonder whether sending my whole project here is a better way.

Thank you for your attention to my question.
matrix_multiplication.zip (8.0 KB)

My environment versions may help:
Host OS: Win11 24h2
Host Processor: 14600kf
CUDA: 12.8.93
Display Driver: 576.02
Device Name: RTX 4070 Ti SUPER

You could calculate with double precision to see which of the two is more accurate in your case.

That’s what I did. I modified the code in two ways:

  1. Fixed the random seed, so that the generate function produces the same input matrix data every run.
  2. Completed the double calculation path (basically added a cublasDgemm call), selectable at compile time; see the sketch after this list.
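
In sketch form, the compile-time selection looks something like this (the macro name and exact placement are illustrative, not the literal modification):

#ifdef USE_DOUBLE                    // hypothetical switch, e.g. nvcc -DUSE_DOUBLE
typedef double DTYPE;
#define GEMM cublasDgemm             // double-precision path
#else
typedef float DTYPE;
#define GEMM cublasSgemm             // single-precision path
#endif

with srand(1234); (any fixed value) replacing srand(time(NULL)); so generate() sees the same sequence every run.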

When I do that I get results like this:

FLOAT CALCULATION (CPU, GPU/CUBLAS)
(0, 0) 267569632.000000, 267569632.000000
(0, 1) 266916400.000000, 266916512.000000
(0, 2) 268765760.000000, 268765632.000000
(0, 3) 264223248.000000, 264223264.000000
(0, 4) 255403552.000000, 255403584.000000
(1, 0) 273801568.000000, 273801600.000000
(1, 1) 278523488.000000, 278523392.000000
(1, 2) 276506720.000000, 276506752.000000
(1, 3) 274823584.000000, 274823552.000000
(1, 4) 265850304.000000, 265850160.000000
(2, 0) 259909680.000000, 259909792.000000
(2, 1) 264786000.000000, 264785952.000000
(2, 2) 265651488.000000, 265651424.000000
(2, 3) 260584064.000000, 260584144.000000
(2, 4) 253351904.000000, 253351792.000000
(3, 0) 261354752.000000, 261354592.000000
(3, 1) 265288320.000000, 265288576.000000
(3, 2) 260826784.000000, 260826848.000000
(3, 3) 267221984.000000, 267221888.000000
(3, 4) 255389024.000000, 255388832.000000
(4, 0) 261693456.000000, 261693344.000000
(4, 1) 274166112.000000, 274166016.000000
(4, 2) 266335712.000000, 266335776.000000
(4, 3) 267844144.000000, 267844160.000000
(4, 4) 257772992.000000, 257773072.000000

DOUBLE CALCULATION (CPU, GPU/CUBLAS)
(0, 0) 267569561.000000, 267569561.000000
(0, 1) 266916533.000000, 266916533.000000
(0, 2) 268765654.000000, 268765654.000000
(0, 3) 264223253.000000, 264223253.000000
(0, 4) 255403590.000000, 255403590.000000
(1, 0) 273801644.000000, 273801644.000000
(1, 1) 278523489.000000, 278523489.000000
(1, 2) 276506714.000000, 276506714.000000
(1, 3) 274823532.000000, 274823532.000000
(1, 4) 265850147.000000, 265850147.000000
(2, 0) 259909803.000000, 259909803.000000
(2, 1) 264785989.000000, 264785989.000000
(2, 2) 265651427.000000, 265651427.000000
(2, 3) 260584163.000000, 260584163.000000
(2, 4) 253351797.000000, 253351797.000000
(3, 0) 261354579.000000, 261354579.000000
(3, 1) 265288529.000000, 265288529.000000
(3, 2) 260826798.000000, 260826798.000000
(3, 3) 267221847.000000, 267221847.000000
(3, 4) 255388903.000000, 255388903.000000
(4, 0) 261693373.000000, 261693373.000000
(4, 1) 274165966.000000, 274165966.000000
(4, 2) 266335782.000000, 266335782.000000
(4, 3) 267844167.000000, 267844167.000000
(4, 4) 257773075.000000, 257773075.000000

First of all I’ll just advance without proof the notion that the double calculation represents the “correct” values. This is generally supported (although not proven) by the observation that the CPU and GPU(CUBLAS) results match exactly. Given that, we can look at specific differences in the float case. I’ll just look at the first 3 lines of output. Comparing the first line of output:

FLOAT:  (0, 0) 267569632.000000, 267569632.000000
DOUBLE: (0, 0) 267569561.000000, 267569561.000000

Here we see that even in the float case, the results between CPU and GPU(CUBLAS) match. Therefore although neither one matches the “golden” double result, they are equidistant from it. Neither is more or less “accurate”. In the next line:

FLOAT:  (0, 1) 266916400.000000, 266916512.000000
DOUBLE: (0, 1) 266916533.000000, 266916533.000000

we see that the CPU (first) result is farther from the double value than the GPU (second) result. If we wish, we could say that particular float GPU result is “more accurate” than the corresponding CPU result. In the 3rd line:

FLOAT:  (0, 2) 268765760.000000, 268765632.000000
DOUBLE: (0, 2) 268765654.000000, 268765654.000000

we see the same thing: the CPU result appears to be farther away from the “golden” double calculation.
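
For scale, take the (0, 1) line: floats of this magnitude lie between 2^27 and 2^28 and are therefore spaced 16 apart. The CPU value 266916400 sits 133 away from the double result 266916533, roughly 8 ulps, while the GPU value 266916512 sits 21 away, roughly 1 ulp. Deviations of a few ulps are exactly what you would expect from reordering a 1024-term float summation.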

This doesn’t preclude the possibility that you may find situations in the output where the CUBLAS/GPU result is “farther away” from the “golden” value than the CPU result. I didn’t spot any but didn’t look hard. However, I don’t see any data that would support a general conclusion that CUBLAS is somehow “less accurate” than something else.

As an off-topic aside, I witnessed something in your code that I had never seen before; I would not have guessed a priori that this would work, but it seems to.

The gpu_check routine (in gpu_main.cu) accepts a function pointer. You are variously passing pointers to “ordinary” functions as well as those decorated with __global__. Within the gpu_check routine, you are configuring this function pointer call as follows:


int gpu_check(const char* method, void (*kernal)(const DTYPE*, const DTYPE*, DTYPE*),
              int loop = 1)
{
    ...
        kernal<<<grid, block>>>(d_a, d_b, d_tmp);

in particular, you call it with:

printf("average: %d ms\n\n", gpu_check("cublas", multiply_cublas));

where multiply_cublas (in matrix_gpu.cu) is not decorated with __global__. It’s amazing to me that this seems to work, giving the desired behavior whether a kernel function pointer or an ordinary function pointer is passed (in the latter case it seems that the kernel launch config is simply ignored).

Interesting/weird.


I totally understand what you mean. Even the CPU float calculation result is not accurate, right?

It’s really my fault that I regarded the CPU result as the golden value.

Maybe the following result misled me:

Naive: 7 ms
(0, 0) 272445280.000000, 272445280.000000
(0, 1) 272732736.000000, 272732736.000000
(0, 2) 269240832.000000, 269240832.000000
(0, 3) 264895952.000000, 264895952.000000
(0, 4) 268468896.000000, 268468896.000000
......

Block: 7 ms
(0, 0) 272445280.000000, 272445280.000000
(0, 1) 272732736.000000, 272732736.000000
(0, 2) 269240832.000000, 269240832.000000
(0, 3) 264895952.000000, 264895952.000000
(0, 4) 268468896.000000, 268468896.000000
(1, 0) 258575440.000000, 258575440.000000
(1, 1) 259670240.000000, 259670240.000000
(1, 2) 257178240.000000, 257178240.000000
(1, 3) 259349312.000000, 259349312.000000
(1, 4) 259792512.000000, 259792512.000000
(2, 0) 274808320.000000, 274808320.000000
(2, 1) 280529408.000000, 280529408.000000
(2, 2) 277083168.000000, 277083168.000000
(2, 3) 276412352.000000, 276412352.000000
(2, 4) 281763360.000000, 281763360.000000
(3, 0) 264192432.000000, 264192432.000000
(3, 1) 265752528.000000, 265752528.000000
(3, 2) 262523520.000000, 262523520.000000
......

Shared memory: 8 ms
(0, 0) 272445280.000000, 272445280.000000
(0, 1) 272732736.000000, 272732736.000000
(0, 2) 269240832.000000, 269240832.000000
(0, 3) 264895952.000000, 264895952.000000
(0, 4) 268468896.000000, 268468896.000000
(1, 0) 258575440.000000, 258575440.000000
(1, 1) 259670240.000000, 259670240.000000
(1, 2) 257178240.000000, 257178240.000000
(1, 3) 259349312.000000, 259349312.000000
(1, 4) 259792512.000000, 259792512.000000
(2, 0) 274808320.000000, 274808320.000000
(2, 1) 280529408.000000, 280529408.000000
(2, 2) 277083168.000000, 277083168.000000
......

The results are exactly the same.

Now I think it just indicates that my GPU methods have the same errors as the CPU method. : )

Oh, I didn’t notice that I was calling a host function via kernal<<<grid, block>>>(d_a, d_b, d_tmp);
That’s really an accident, and I’d like to figure it out if possible.
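
If I refactor it, one way to avoid depending on this accident might be thin host-side launch wrappers, so that gpu_check only ever calls an ordinary host function (a sketch; the wrapper names are mine):

void launch_naive(dim3 grid, dim3 block, const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    multiply_naive<<<grid, block>>>(A, B, C);  // kernel-launch syntax stays with the __global__ function
}

void launch_cublas(dim3, dim3, const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    multiply_cublas(A, B, C);  // plain host call; no launch configuration needed
}

// gpu_check would then take: void (*fn)(dim3, dim3, const DTYPE*, const DTYPE*, DTYPE*)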
