Matrix multiplication performance issue

I used several methods to optimize matrix multiplication,
including naive, shared memory, a reduced-bank-conflict strategy, and so on.
What confuses me is the performance measurement.

And here is the Nsight Compute analysis.

You’ll find that the naive method apparently performs better than the optimized versions.

I don’t know why it turns out like that…

Every output has been compared to the standard answer (generated by a CPU method).

I will post my code below.


My code is like this…
If you need more code segments, I will post more.

full code here


__global__ void multiply_naive(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;  // used here as the row index
    int ty = blockIdx.y * blockDim.y + threadIdx.y;  // used here as the column index
    DTYPE sum = 0;
    for (int k = 0; k < MATRIX_W; k++)
        sum += A[tx * MATRIX_W + k] * B[k * MATRIX_W + ty];
    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_block(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += A[tx * MATRIX_W + (m + k)] * B[(m + k) * MATRIX_W + ty];

    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_shared(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ DTYPE As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ DTYPE Bs[BLOCK_SIZE][BLOCK_SIZE];

    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
    {
        As[threadIdx.x][threadIdx.y] = A[tx * MATRIX_W + (m + threadIdx.y)];
        Bs[threadIdx.x][threadIdx.y] = B[(m + threadIdx.x) * MATRIX_W + ty];

        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += As[threadIdx.x][k] * Bs[k][threadIdx.y];

        __syncthreads();
    }

    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_less_conflict(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ DTYPE As[BLOCK_SIZE][BLOCK_SIZE + 1];  // +1 column of padding to avoid shared memory bank conflicts
    __shared__ DTYPE Bs[BLOCK_SIZE][BLOCK_SIZE + 1];

    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
    {
        As[threadIdx.y][threadIdx.x] = A[tx * MATRIX_W + (m + threadIdx.y)];
        Bs[threadIdx.x][threadIdx.y] = B[(m + threadIdx.x) * MATRIX_W + ty];

        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += As[k][threadIdx.x] * Bs[k][threadIdx.y];

        __syncthreads();
    }

    C[tx * MATRIX_W + ty] = sum;
}

__global__ void multiply_unroll(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ DTYPE As[BLOCK_SIZE][BLOCK_SIZE + 1];
    __shared__ DTYPE Bs[BLOCK_SIZE][BLOCK_SIZE + 1];

    DTYPE sum = 0;

    for (int m = 0; m < MATRIX_W; m += BLOCK_SIZE)
    {
        As[threadIdx.y][threadIdx.x] = A[tx * MATRIX_W + (m + threadIdx.y)];
        Bs[threadIdx.x][threadIdx.y] = B[(m + threadIdx.x) * MATRIX_W + ty];

        __syncthreads();

        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += As[k][threadIdx.x] * Bs[k][threadIdx.y];

        __syncthreads();
    }

    C[tx * MATRIX_W + ty] = sum;
}


const int MATRIX_H = 1024;
const int MATRIX_W = 1024;
const int BLOCK_SIZE = 32;

Matrix a, b, res, tmp;
DTYPE *d_a, *d_b, *d_tmp;

int gpu_check(const char* method, void (*kernal)(const DTYPE*, const DTYPE*, DTYPE*),
              int loop = 7)
{
    GpuTimer timer;
    std::vector<int> time_rec;

    for (int i = 1; i <= loop; i++)
    {
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(MATRIX_H / BLOCK_SIZE, MATRIX_W / BLOCK_SIZE);

        CHECK(cudaMemset(d_tmp, 0, MATRIX_H * MATRIX_W * sizeof(DTYPE)));

        timer.Start();
        kernal<<<grid, block>>>(d_a, d_b, d_tmp);
        CHECK(cudaDeviceSynchronize());
        timer.Stop();

        CHECK(cudaMemcpy(tmp.a, d_tmp, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyDeviceToHost));

        int time_elapsed = static_cast<int>(timer.Elapsed());
        printf("%s: %d ms\n", method, time_elapsed);

        if (!check_mat(res, tmp))
        {
            printf("Not correct result! method: %s\n", method);
            throw "wrong result";
        }
        time_rec.emplace_back(time_elapsed);
    }

    std::sort(time_rec.begin(), time_rec.end());
    return time_rec[(time_rec.size() - 1) >> 1];  // median of the recorded times (reported as "average" in main)
}

void Init()
{
    srand(time(NULL));
    a.generate();
    b.generate();
    Multiply(a, b, res);

    CHECK(cudaMalloc((void **)&d_a, MATRIX_H * MATRIX_W * sizeof(DTYPE)));
    CHECK(cudaMalloc((void **)&d_b, MATRIX_H * MATRIX_W * sizeof(DTYPE)));
    CHECK(cudaMalloc((void **)&d_tmp, MATRIX_H * MATRIX_W * sizeof(DTYPE)));
    CHECK(cudaMemcpy(d_a, a.a, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, b.a, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyHostToDevice));
}

void Return()
{
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_tmp);
}

int main()
{
    Init();

    printf("GPU version:\n");
    printf("average: %d ms\n\n", gpu_check("Naive", multiply_naive));
    printf("average: %d ms\n\n", gpu_check("Block", multiply_block));
    printf("average: %d ms\n\n", gpu_check("Shared memory", multiply_shared));
    printf("average: %d ms\n\n", gpu_check("less conflict", multiply_less_conflict));
    printf("average: %d ms\n\n", gpu_check("Unroll", multiply_unroll));

    Return();
    return 0;
}

Please don’t paste pictures of code on this forum. Use the forum tools to properly format your code as text:

1. Paste the code into an edit window.
2. Select the code.
3. Press the </> button at the top of the edit window.
4. Save your changes.

Please fix it now.
I also recommend posting the complete code, not just the kernels.

A common error people make in these situations is to profile code that has been built as a debug project (e.g. on Windows) or with the debug compilation switch (-G) for device code.

If you are doing that, it may be the explanation. If you are interested in performance analysis, you should not be compiling debug device code.
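
For reference, the difference is just the build flags; the file and binary names here are illustrative. -G builds debug device code with most device optimizations disabled, while -lineinfo keeps source correlation for Nsight Compute without the performance penalty:

nvcc -G -o matmul_debug gpu_main.cu
nvcc -O3 -lineinfo -o matmul gpu_main.cu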

Thank you for your reply!

Turning off the -G option really works as expected.

Really sorry for the trouble caused by pasting pictures of code; I will post my code as text next time.

Thank you again for your genuine advice!

You also have a common pattern of using tx to select the matrix row and ty to select the matrix column, leading to element indexing calculations such as tx * MATRIX_W + ty.

This is generally not conducive to good memory access patterns across the warp, so refactoring your code to use the alternate pattern ty * MATRIX_W + tx (i.e. use ty to select the matrix row and tx to select the matrix column) may result in generally better performance.
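
As a minimal sketch of that refactor (reusing your DTYPE and MATRIX_W; not yet the programming-guide version mentioned below), the naive kernel would become:

__global__ void multiply_naive_coalesced(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;  // tx now selects the column
    int ty = blockIdx.y * blockDim.y + threadIdx.y;  // ty now selects the row
    DTYPE sum = 0;
    for (int k = 0; k < MATRIX_W; k++)
        sum += A[ty * MATRIX_W + k] * B[k * MATRIX_W + tx];  // the B read is coalesced across the warp
    C[ty * MATRIX_W + tx] = sum;  // the C write is also coalesced
}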

More efficient “naive” and “shared” matrix multiply codes can be found in the programming guide. For best performance, the usual suggestion would be to use a high-quality library such as CUBLAS.

You mean memory coalescing, right? That’s really good advice.

In fact, I planned to write that in a new function and make a comparison… but in the end I forgot.

By the way, I found that cublas performs a little worse in accuracy, which means that when I verify the result, a relative error tolerance of about 1e-5 is needed.

You may be making an error again. There’s no way to tell from the code you have posted. I don’t know of any reason why cublas would have systematically worse accuracy than, for example, your naive test case or shared memory test case.

I can’t tell what your DTYPE is, but the 1e-5 error bound seems possibly typical of using float datatype. If your DTYPE is actually double then that would not be a valid comparison.

If you assume that the values (products) you add up as sum have roughly the same exponent, then a better addition scheme is a tree instead of a loop.

It is called pairwise or cascaded summation:
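
A minimal recursive host-side sketch (reusing your DTYPE; the function name is mine):

DTYPE pairwise_sum(const DTYPE* v, int n)
{
    // add the two halves of the range separately, then combine;
    // rounding error grows like O(log n) rather than O(n) for a left-to-right loop
    if (n == 1) return v[0];
    int half = n / 2;
    return pairwise_sum(v, half) + pairwise_sum(v + half, n - half);
}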

DTYPE is actually float, and here are the cublas function and result.

void multiply_cublas(const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    // cublas assumes column-major storage; a row-major matrix reinterpreted
    // as column-major is its transpose. Swapping the operands computes
    // C^T = B^T * A^T in column-major terms, which is C = A * B when the
    // result buffer is read back as row-major.
    const DTYPE alpha = 1, beta = 0;
    cublasSgemm(
        handle,
        CUBLAS_OP_N, CUBLAS_OP_N,
        MATRIX_H, MATRIX_W, MATRIX_H,
        &alpha,
        B, MATRIX_H,
        A, MATRIX_W,
        &beta,
        C, MATRIX_H
    );

    cublasDestroy(handle);  // creating/destroying the handle inside the timed call adds overhead
}
int gpu_check(const char* method, void (*kernal)(const DTYPE*, const DTYPE*, DTYPE*),
              int loop = 1)
{
    GpuTimer timer;
    std::vector<int> time_rec;

    for (int i = 1; i <= loop; i++)
    {
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(MATRIX_H / BLOCK_SIZE, MATRIX_W / BLOCK_SIZE);

        CHECK(cudaMemset(d_tmp, 0, MATRIX_H * MATRIX_W * sizeof(DTYPE)));

        timer.Start();
        kernal<<<grid, block>>>(d_a, d_b, d_tmp);
        CHECK(cudaDeviceSynchronize());
        timer.Stop();

        CHECK(cudaMemcpy(tmp.a, d_tmp, MATRIX_H * MATRIX_W * sizeof(DTYPE), cudaMemcpyDeviceToHost));

        int time_elapsed = static_cast<int>(timer.Elapsed());
        printf("%s: %d ms\n", method, time_elapsed);

        // if (!check_mat(res, tmp))
        // {
        //     printf("Not correct result! method: %s\n", method);
        //     throw "wrong result";
        // }
        // print a 5x5 corner of the CPU reference (res) vs the GPU result (tmp)
        for (int r = 0; r < 5; r++)
            for (int c = 0; c < 5; c++)
                printf("(%d, %d) %f, %f\n", r, c, res[r][c], tmp[r][c]);
        time_rec.emplace_back(time_elapsed);
    }

    std::sort(time_rec.begin(), time_rec.end());
    return time_rec[(time_rec.size() - 1) >> 1];
}
GPU version:
cublas: 39 ms
(0, 0) 270538880.000000, 270539104.000000
(0, 1) 267656240.000000, 267656128.000000
(0, 2) 272429184.000000, 272429152.000000
(0, 3) 269264896.000000, 269264832.000000
(0, 4) 264540864.000000, 264541088.000000
(1, 0) 262382592.000000, 262382816.000000
(1, 1) 271015744.000000, 271015744.000000
(1, 2) 260825536.000000, 260825408.000000
(1, 3) 261286400.000000, 261286240.000000
(1, 4) 259316672.000000, 259316784.000000
(2, 0) 272900736.000000, 272900800.000000
(2, 1) 272983456.000000, 272983520.000000
(2, 2) 269640256.000000, 269640000.000000
(2, 3) 273878208.000000, 273878208.000000
(2, 4) 272137504.000000, 272137664.000000
(3, 0) 271197664.000000, 271197824.000000
(3, 1) 273098336.000000, 273098304.000000
(3, 2) 268501184.000000, 268501120.000000
(3, 3) 263617552.000000, 263617536.000000
(3, 4) 266090336.000000, 266090256.000000
(4, 0) 262191408.000000, 262191392.000000
(4, 1) 264995696.000000, 264995712.000000
(4, 2) 262589776.000000, 262590016.000000
(4, 3) 261433888.000000, 261433920.000000
(4, 4) 258600400.000000, 258600416.000000

In addition, here is the Matrix type definition.

class Matrix
{
public:
    Matrix() = default;

    DTYPE a[MATRIX_H][MATRIX_W];

    DTYPE* operator [](const int& x)
    {
        return a[x];
    }

    const DTYPE* operator [](const int& x) const
    {
        return a[x];
    }

    void generate();
    void init();
};

You will find that the relative error exists. I don’t know whether it’s my fault or cublas’s.

A relative error does not mean cublas is the issue. The cublas result might be more accurate than the other. And you still haven’t provided complete code that I could compile, run, and test.

Good luck!

Alright, I thought about posting my complete code here, but several files may take up too much space…
I just wonder whether sending my whole project here is a better way.

Thank you for your attention to my question.
matrix_multiplication.zip (8.0 KB)

My environment versions may help:
Host OS: Win11 24h2
Host Processor: 14600kf
CUDA: 12.8.93
Display Driver: 576.02
Device Name: RTX 4070 Ti SUPER

You could calculate with double precision to see which of the two is more accurate in your case.

That’s what I did. I modified the code in two ways:

  1. Fixed the random seed, so that the generate function produces the same input matrix data every run.
  2. Completed the double calculation path (basically added a cublasDgemm call), selectable at compile time; see the sketch after this list.
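
In sketch form, the compile-time selection looks something like this (the macro name and exact placement are illustrative, not the literal modification):

#ifdef USE_DOUBLE                    // hypothetical switch, e.g. nvcc -DUSE_DOUBLE
typedef double DTYPE;
#define GEMM cublasDgemm             // double-precision path
#else
typedef float DTYPE;
#define GEMM cublasSgemm             // single-precision path
#endif

with srand(1234); (any fixed value) replacing srand(time(NULL)); so generate() sees the same sequence every run.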

When I do that I get results like this:

FLOAT CALCULATION (CPU, GPU/CUBLAS)
(0, 0) 267569632.000000, 267569632.000000
(0, 1) 266916400.000000, 266916512.000000
(0, 2) 268765760.000000, 268765632.000000
(0, 3) 264223248.000000, 264223264.000000
(0, 4) 255403552.000000, 255403584.000000
(1, 0) 273801568.000000, 273801600.000000
(1, 1) 278523488.000000, 278523392.000000
(1, 2) 276506720.000000, 276506752.000000
(1, 3) 274823584.000000, 274823552.000000
(1, 4) 265850304.000000, 265850160.000000
(2, 0) 259909680.000000, 259909792.000000
(2, 1) 264786000.000000, 264785952.000000
(2, 2) 265651488.000000, 265651424.000000
(2, 3) 260584064.000000, 260584144.000000
(2, 4) 253351904.000000, 253351792.000000
(3, 0) 261354752.000000, 261354592.000000
(3, 1) 265288320.000000, 265288576.000000
(3, 2) 260826784.000000, 260826848.000000
(3, 3) 267221984.000000, 267221888.000000
(3, 4) 255389024.000000, 255388832.000000
(4, 0) 261693456.000000, 261693344.000000
(4, 1) 274166112.000000, 274166016.000000
(4, 2) 266335712.000000, 266335776.000000
(4, 3) 267844144.000000, 267844160.000000
(4, 4) 257772992.000000, 257773072.000000

DOUBLE CALCULATION (CPU, GPU/CUBLAS)
(0, 0) 267569561.000000, 267569561.000000
(0, 1) 266916533.000000, 266916533.000000
(0, 2) 268765654.000000, 268765654.000000
(0, 3) 264223253.000000, 264223253.000000
(0, 4) 255403590.000000, 255403590.000000
(1, 0) 273801644.000000, 273801644.000000
(1, 1) 278523489.000000, 278523489.000000
(1, 2) 276506714.000000, 276506714.000000
(1, 3) 274823532.000000, 274823532.000000
(1, 4) 265850147.000000, 265850147.000000
(2, 0) 259909803.000000, 259909803.000000
(2, 1) 264785989.000000, 264785989.000000
(2, 2) 265651427.000000, 265651427.000000
(2, 3) 260584163.000000, 260584163.000000
(2, 4) 253351797.000000, 253351797.000000
(3, 0) 261354579.000000, 261354579.000000
(3, 1) 265288529.000000, 265288529.000000
(3, 2) 260826798.000000, 260826798.000000
(3, 3) 267221847.000000, 267221847.000000
(3, 4) 255388903.000000, 255388903.000000
(4, 0) 261693373.000000, 261693373.000000
(4, 1) 274165966.000000, 274165966.000000
(4, 2) 266335782.000000, 266335782.000000
(4, 3) 267844167.000000, 267844167.000000
(4, 4) 257773075.000000, 257773075.000000

First of all I’ll just advance without proof the notion that the double calculation represents the “correct” values. This is generally supported (although not proven) by the observation that the CPU and GPU(CUBLAS) results match exactly. Given that, we can look at specific differences in the float case. I’ll just look at the first 3 lines of output. Comparing the first line of output:

FLOAT:  (0, 0) 267569632.000000, 267569632.000000
DOUBLE: (0, 0) 267569561.000000, 267569561.000000

Here we see that even in the float case, the results between CPU and GPU(CUBLAS) match. Therefore although neither one matches the “golden” double result, they are equidistant from it. Neither is more or less “accurate”. In the next line:

FLOAT:  (0, 1) 266916400.000000, 266916512.000000
DOUBLE: (0, 1) 266916533.000000, 266916533.000000

we see that the CPU (first) result is farther from the double value than the GPU (second) result. If we wish, we could say that particular float GPU result is “more accurate” than the corresponding CPU result. In the 3rd line:

FLOAT:  (0, 2) 268765760.000000, 268765632.000000
DOUBLE: (0, 2) 268765654.000000, 268765654.000000

we see the same thing: the CPU result appears to be farther away from the “golden” double calculation.
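
For scale, take the (0, 1) line: floats of this magnitude lie between 2^27 and 2^28 and are therefore spaced 16 apart. The CPU value 266916400 sits 133 away from the double result 266916533, roughly 8 ulps, while the GPU value 266916512 sits 21 away, roughly 1 ulp. Deviations of a few ulps are exactly what you would expect from reordering a 1024-term float summation.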

This doesn’t preclude the possibility that you may find situations in the output where the CUBLAS/GPU result is “farther away” from the “golden” value than the CPU result. I didn’t spot any but didn’t look hard. However, I don’t see any data that would support a general conclusion that CUBLAS is somehow “less accurate” than something else.

As an off-topic aside, I witnessed something in your code that I had never seen before; I would not have guessed a priori that this would work, but it seems to.

The gpu_check routine (in gpu_main.cu) accepts a function pointer. You are variously passing pointers to “ordinary” functions as well as those decorated with __global__. Within the gpu_check routine, you are configuring this function pointer call as follows:


int gpu_check(const char* method, void (*kernal)(const DTYPE*, const DTYPE*, DTYPE*),
              int loop = 1)
{
    ...
        kernal<<<grid, block>>>(d_a, d_b, d_tmp);

in particular, you call it with:

printf("average: %d ms\n\n", gpu_check("cublas", multiply_cublas));

where multiply_cublas (in matrix_gpu.cu) is not decorated with __global__. It’s amazing to me that this seems to work, giving the desired behavior whether a kernel function pointer or an ordinary function pointer is passed (in the latter case it seems that the kernel launch config is simply ignored).

Interesting/weird.


I totally understand what you mean. Even the CPU float calculation result is not accurate, right?

It’s really my fault that I regarded the CPU result as the golden value.

Maybe the following result misled me:

Naive: 7 ms
(0, 0) 272445280.000000, 272445280.000000
(0, 1) 272732736.000000, 272732736.000000
(0, 2) 269240832.000000, 269240832.000000
(0, 3) 264895952.000000, 264895952.000000
(0, 4) 268468896.000000, 268468896.000000
......

Block: 7 ms
(0, 0) 272445280.000000, 272445280.000000
(0, 1) 272732736.000000, 272732736.000000
(0, 2) 269240832.000000, 269240832.000000
(0, 3) 264895952.000000, 264895952.000000
(0, 4) 268468896.000000, 268468896.000000
(1, 0) 258575440.000000, 258575440.000000
(1, 1) 259670240.000000, 259670240.000000
(1, 2) 257178240.000000, 257178240.000000
(1, 3) 259349312.000000, 259349312.000000
(1, 4) 259792512.000000, 259792512.000000
(2, 0) 274808320.000000, 274808320.000000
(2, 1) 280529408.000000, 280529408.000000
(2, 2) 277083168.000000, 277083168.000000
(2, 3) 276412352.000000, 276412352.000000
(2, 4) 281763360.000000, 281763360.000000
(3, 0) 264192432.000000, 264192432.000000
(3, 1) 265752528.000000, 265752528.000000
(3, 2) 262523520.000000, 262523520.000000
......

Shared memory: 8 ms
(0, 0) 272445280.000000, 272445280.000000
(0, 1) 272732736.000000, 272732736.000000
(0, 2) 269240832.000000, 269240832.000000
(0, 3) 264895952.000000, 264895952.000000
(0, 4) 268468896.000000, 268468896.000000
(1, 0) 258575440.000000, 258575440.000000
(1, 1) 259670240.000000, 259670240.000000
(1, 2) 257178240.000000, 257178240.000000
(1, 3) 259349312.000000, 259349312.000000
(1, 4) 259792512.000000, 259792512.000000
(2, 0) 274808320.000000, 274808320.000000
(2, 1) 280529408.000000, 280529408.000000
(2, 2) 277083168.000000, 277083168.000000
......

The results are exactly the same.

Now I think it just indicates that my GPU methods have the same errors as the CPU method. : )

Oh, I didn’t notice that I was calling a host function via kernal<<<grid, block>>>(d_a, d_b, d_tmp);
That’s really an accident, and I’d like to figure it out if possible.
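
If I refactor it, one way to avoid depending on this accident might be thin host-side launch wrappers, so that gpu_check only ever calls an ordinary host function (a sketch; the wrapper names are mine):

void launch_naive(dim3 grid, dim3 block, const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    multiply_naive<<<grid, block>>>(A, B, C);  // kernel-launch syntax stays with the __global__ function
}

void launch_cublas(dim3, dim3, const DTYPE* A, const DTYPE* B, DTYPE* C)
{
    multiply_cublas(A, B, C);  // plain host call; no launch configuration needed
}

// gpu_check would then take: void (*fn)(dim3, dim3, const DTYPE*, const DTYPE*, DTYPE*)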
