simple matrixmultiplication

hello all,

i’m new to cuda and i want to create a matrix-multiplication-program

when i launch it, it doesn’t print the expected result

for example with this configuration it prints:

1 8

26 8

and if i run with a other configuration and rerun with this it prints something other

th only configuration witch prints the correct result is: dim 4 and thread 4

14 20 26 32

20 30 40 50

26 40 54 68

32 50 68 86

do you have any idea where my mistake is?

thanks in advance for any help

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include "book.h"

#define dim 2

#define thread 1

#define ax dim

#define ay dim

#define bx dim

#define by dim

__global__ void mamu(int *a, int *b, int *c) {

	int x = blockIdx.x * blockDim.x + threadIdx.x;

	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (((x+1)*(y+1))<dim) {

		int sum = 0;

		for (int i = 0; i < ay; i++) {

			sum += a[x * ay + i] * b[i * by + y];

		}

		c[x * ax + y] = sum;

	}

}

void fill(int *a, int x, int y) {

	for (int i = 0; i < x; i++) {

		for (int j = 0; j < y; j++) {

			a[i * y + j] = i + j;

		}

	}

}

void print(int *a, int x, int y) {

	for (int i = 0; i < x; i++) {

		for (int j = 0; j < y; j++) {

			printf("%d\t", a[i * y + j]);

		}

		printf("\n");

	}

	printf("\n");

}

int main(void) {

	int *a, *b, *c;

	int *dev_a, *dev_b, *dev_c;

	cudaEvent_t start, stop;

	float elapsedTime;

	a = (int*) malloc(dim * dim * sizeof(int));

	b = (int*) malloc(dim * dim * sizeof(int));

	c = (int*) malloc(dim * dim * sizeof(int));

	HANDLE_ERROR(cudaMalloc((void**) &dev_a, dim * dim * sizeof(int)));

	HANDLE_ERROR(cudaMalloc((void**) &dev_b, dim * dim * sizeof(int)));

	HANDLE_ERROR(cudaMalloc((void**) &dev_c, dim * dim * sizeof(int)));

	long long t = time(NULL);

	fill(a, ax, ay);

	fill(b, bx, by);

	printf("time to fill: %lld s\n", time(NULL) - t);

	//print( a, ax, ay);

	//print( b, bx, by);

	HANDLE_ERROR(cudaMemcpy(dev_a, a, dim * dim * sizeof(int),cudaMemcpyHostToDevice));

	HANDLE_ERROR(cudaMemcpy(dev_b, b, dim * dim * sizeof(int),cudaMemcpyHostToDevice));

	dim3	blocks((dim+thread-1) / thread, (dim+thread-1) / thread);

	dim3	threads(thread, thread);

	HANDLE_ERROR(cudaEventCreate(&start));

	HANDLE_ERROR(cudaEventCreate(&stop));

	HANDLE_ERROR(cudaEventRecord(start, 0));

	mamu<<<blocks,threads>>>(dev_a, dev_b, dev_c);

	

	printf("done!\n");

	HANDLE_ERROR(cudaEventRecord(stop, 0));

	HANDLE_ERROR(cudaEventSynchronize(stop));

	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));

	printf("time:  %3.1f ms\n", elapsedTime);

	HANDLE_ERROR(cudaEventDestroy(start));

	HANDLE_ERROR(cudaEventDestroy(stop));

	HANDLE_ERROR(cudaMemcpy(c, dev_c, dim * dim * sizeof(int),cudaMemcpyDeviceToHost));

	HANDLE_ERROR(cudaFree(dev_c));

	print( c, ax, by);

}

I haven’t read any further than this line, but

if (((x+1)*(y+1))<dim) {

is probably meant to be

if ((x<dim) && (y<dim)) {