3x3 Matrix multiply giving incorrect result

The following matrix multiplication is not working properly.

Output

user@server:~/$ ./test.bin
1 2 3
4 5 6
7 8 9
1 4 7
2 5 8
3 6 9

threads_per_block 32 * 8 * 4 = 1024

blocks_per_grid 1 * 1 * 1 = 1
30 37 54
81 94 117
150 129 81
File written successfully.
user@server:~/$

The expected result was:

 30  36  42 
 66  81  96
102 126 150

How can I fix it?

Source code

////   test.cu   ////
#include "common.hpp"

const int min_ = 0;
const int max_ = 10;

__global__ void MultiplyMatKernel(I* A, I* B, I* C, int N)
{
    int dimx = N;
	int dimy = N;
	int dimz = N;

    int r = blockIdx.x * blockDim.x + threadIdx.x;
    int c = blockIdx.y * blockDim.y + threadIdx.y;
	int d = blockIdx.z * blockDim.z + threadIdx.z;

    if (r < N && c < N && d < N) 
	{
        int loc_c = d * dimx * dimy + c * dimx + r;
		int loc_a = d * dimx * dimy + c * dimx + r;
		int loc_b = d * dimx * dimy + c * dimx + r;
        for (int cc=0; cc<N; cc++) 
		{	
            C[loc_c] += A[loc_a+cc]*B[loc_b+cc];
        }
		//printf("C[%d]=%d  \n", loc_c, C[loc_c]);
    }
}

void Transpose(I *A, I**At, int N)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            // copy the value at (i,j) to (j,i) in At
            (*At)[j*N + i] = A[i*N + j];
        }
    }
}

void Print(I*A, int N)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%d ", A[i*N + j]);
        }
		printf("\n");
    }
}

int main()
{    
    I * host_a;
    I * host_b;
	I * host_b_T;
    I * host_c;
    I * device_a;
    I * device_b;
    I * device_c;
	int kernel_len;
	int length;
    dim3 threads_per_block;
    dim3 blocks_per_grid;    
    
	kernel_len = 3;
    length = kernel_len * kernel_len * 1;
    host_a = (I *) malloc(sizeof(I) * length);
    host_b = (I *) malloc(sizeof(I) * length);
	host_b_T = (I *) malloc(sizeof(I) * length);
    host_c = (I *) malloc(sizeof(I) * length);
    
    if (host_a == nullptr || host_b == nullptr || host_c == nullptr)
    {
        std::cerr << "Error: Memory allocation for host arrays failed." << std::endl;
        exit(1);
    }

    CHECK_CUDA_ERROR(cudaMalloc((void**) &device_a, sizeof(I) * length));
    CHECK_CUDA_ERROR(cudaMalloc((void**) &device_b, sizeof(I) * length));
    CHECK_CUDA_ERROR(cudaMalloc((void**) &device_c, sizeof(I) * length));

    for (int i = 0; i < length ; ++i) 
    {
        host_a[i] = i+1;
        host_b[i] = i+1;
        host_c[i] = 0;
    }

	Transpose(host_b, &host_b_T, kernel_len);

    Print(host_a, kernel_len);
    Print(host_b_T, kernel_len);

    int dimx = kernel_len;
    int dimy = kernel_len;
    int dimz = 1;

    //int max_thread = 1024;
    threads_per_block = dim3(32, 8, 4); // because, 1204 = 32*8*4 
    blocks_per_grid = dim3((dimx + threads_per_block.x - 1) / threads_per_block.x, 
	                       (dimy + threads_per_block.y - 1) / threads_per_block.y, 
						   (dimz + threads_per_block.z - 1) / threads_per_block.z);

    print_dim3("threads_per_block", threads_per_block);
    print_dim3("blocks_per_grid", blocks_per_grid);

    CHECK_CUDA_ERROR(cudaMemcpy(device_a, host_a, sizeof(I) * length, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy(device_b, host_b_T, sizeof(I) * length, cudaMemcpyHostToDevice));
    
    MultiplyMatKernel<<<blocks_per_grid, threads_per_block>>>(device_a, device_b, device_c, kernel_len);
	
    CHECK_LAST_CUDA_ERROR();	
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());	
    CHECK_CUDA_ERROR(cudaMemcpy(host_c, device_c, sizeof(I) * length, cudaMemcpyDeviceToHost));
	
	Print(host_c, kernel_len);
	
    write_output_to_file(host_a, host_b, host_c, "output.txt", length);
	
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);
	
    free(host_a);
    free(host_b);
    free(host_c);
}

Your source code is incomplete. By itself, it cannot be compiled.

Your kernel code is doing illegal/out-of-bounds indexing, which you can discover using the methodology covered here.

You are not initializing the C device array before adding values to it.

Your method of calculating the the matrix multiply also doesn’t make any sense.

Finally, your expected result doesn’t make any sense. The product of the first row of A and the first column of B results in a value of (1x1)+(2x2)+(3x3) which is 14, not 30.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.