I wrote this program:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
const int ARRAY_SIZE = 3;
__global__ void soberiMatricaKernel(float c[ARRAY_SIZE][ARRAY_SIZE], float a[ARRAY_SIZE][ARRAY_SIZE], float b[ARRAY_SIZE][ARRAY_SIZE])
{
int i = threadIdx.x;
int j = threadIdx.y;
c[i][j] = a[i][j] + b[i][j];
}
int main()
{
float d_a[ARRAY_SIZE][ARRAY_SIZE] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };
float d_b[ARRAY_SIZE][ARRAY_SIZE] = { {9, 8, 7}, {6, 5, 4}, {3, 2, 1} };
float d_c[ARRAY_SIZE][ARRAY_SIZE];
cudaMalloc((void**)&d_a, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
cudaMalloc((void**)&d_b, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
cudaMalloc((void**)&d_c, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
soberiMatricaKernel <<<1, ARRAY_SIZE * ARRAY_SIZE >>> (d_c, d_a, d_b);
for (int i = 0; i < ARRAY_SIZE; i += 1)
{
for (int j = 0; j < ARRAY_SIZE; j += 1)
{
printf("%d\n", d_c[i][j]);
}
}
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
Although it compiles with no error, the execution doesn’t get inside my soberiMatricaKernel() CUDA function. Can somebody spot what do I do wrong?
Should I use cudaMemcpy() before soberiMatricaKernel() ? How it will look like?