# What do I do wrong in my simple "sum two 3x3 matrices" program?

I wrote this program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
const int ARRAY_SIZE = 3;

__global__ void soberiMatricaKernel(float c[ARRAY_SIZE][ARRAY_SIZE], float a[ARRAY_SIZE][ARRAY_SIZE], float b[ARRAY_SIZE][ARRAY_SIZE])
{

c[i][j] = a[i][j] + b[i][j];
}

int main()
{

float d_a[ARRAY_SIZE][ARRAY_SIZE] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };
float d_b[ARRAY_SIZE][ARRAY_SIZE] = { {9, 8, 7}, {6, 5, 4}, {3, 2, 1} };
float d_c[ARRAY_SIZE][ARRAY_SIZE];

cudaMalloc((void**)&d_a, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
cudaMalloc((void**)&d_b, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
cudaMalloc((void**)&d_c, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));

soberiMatricaKernel <<<1, ARRAY_SIZE * ARRAY_SIZE >>> (d_c, d_a, d_b);

for (int i = 0; i < ARRAY_SIZE; i += 1)
{
for (int j = 0; j < ARRAY_SIZE; j += 1)
{
printf("%d\n", d_c[i][j]);
}
}

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

return 0;
}

Although it compiles with no error, the execution doesn’t get inside my soberiMatricaKernel() CUDA function. Can somebody spot what do I do wrong?

Should I use cudaMemcpy() before soberiMatricaKernel() ? How it will look like?