hello all,
i’m new to cuda and i want to create a matrix-multiplication-program
when i launch it, it doesn’t print the expected result
for example with this configuration it prints:
1 8
26 8
and if i run with a other configuration and rerun with this it prints something other
th only configuration witch prints the correct result is: dim 4 and thread 4
14 20 26 32
20 30 40 50
26 40 54 68
32 50 68 86
do you have any idea where my mistake is?
thanks in advance for any help
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "book.h"
#define dim 2
#define thread 1
#define ax dim
#define ay dim
#define bx dim
#define by dim
__global__ void mamu(int *a, int *b, int *c) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (((x+1)*(y+1))<dim) {
int sum = 0;
for (int i = 0; i < ay; i++) {
sum += a[x * ay + i] * b[i * by + y];
}
c[x * ax + y] = sum;
}
}
void fill(int *a, int x, int y) {
for (int i = 0; i < x; i++) {
for (int j = 0; j < y; j++) {
a[i * y + j] = i + j;
}
}
}
void print(int *a, int x, int y) {
for (int i = 0; i < x; i++) {
for (int j = 0; j < y; j++) {
printf("%d\t", a[i * y + j]);
}
printf("\n");
}
printf("\n");
}
int main(void) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
cudaEvent_t start, stop;
float elapsedTime;
a = (int*) malloc(dim * dim * sizeof(int));
b = (int*) malloc(dim * dim * sizeof(int));
c = (int*) malloc(dim * dim * sizeof(int));
HANDLE_ERROR(cudaMalloc((void**) &dev_a, dim * dim * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &dev_b, dim * dim * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &dev_c, dim * dim * sizeof(int)));
long long t = time(NULL);
fill(a, ax, ay);
fill(b, bx, by);
printf("time to fill: %lld s\n", time(NULL) - t);
//print( a, ax, ay);
//print( b, bx, by);
HANDLE_ERROR(cudaMemcpy(dev_a, a, dim * dim * sizeof(int),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b, b, dim * dim * sizeof(int),cudaMemcpyHostToDevice));
dim3 blocks((dim+thread-1) / thread, (dim+thread-1) / thread);
dim3 threads(thread, thread);
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
mamu<<<blocks,threads>>>(dev_a, dev_b, dev_c);
printf("done!\n");
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("time: %3.1f ms\n", elapsedTime);
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaMemcpy(c, dev_c, dim * dim * sizeof(int),cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaFree(dev_c));
print( c, ax, by);
}