What do I do wrong in my simple "sum two 3x3 matrices" program?

I wrote this program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
const int ARRAY_SIZE = 3;

__global__ void soberiMatricaKernel(float c[ARRAY_SIZE][ARRAY_SIZE], float a[ARRAY_SIZE][ARRAY_SIZE], float b[ARRAY_SIZE][ARRAY_SIZE])
{
	int i = threadIdx.x;
	int j = threadIdx.y;

	c[i][j] = a[i][j] + b[i][j];
}

int main()
{
	
	float d_a[ARRAY_SIZE][ARRAY_SIZE] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };
	float d_b[ARRAY_SIZE][ARRAY_SIZE] = { {9, 8, 7}, {6, 5, 4}, {3, 2, 1} };
	float d_c[ARRAY_SIZE][ARRAY_SIZE];

	cudaMalloc((void**)&d_a, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
	cudaMalloc((void**)&d_b, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));
	cudaMalloc((void**)&d_c, ARRAY_SIZE * ARRAY_SIZE * sizeof(float));

	soberiMatricaKernel <<<1, ARRAY_SIZE * ARRAY_SIZE >>> (d_c, d_a, d_b);

	for (int i = 0; i < ARRAY_SIZE; i += 1)
	{
		for (int j = 0; j < ARRAY_SIZE; j += 1)
		{
			printf("%d\n", d_c[i][j]);
		}
	}

	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	return 0;
}

Although it compiles with no error, the execution doesn’t get inside my soberiMatricaKernel() CUDA function. Can somebody spot what do I do wrong?

Should I use cudaMemcpy() before soberiMatricaKernel() ? How it will look like?

You’ve made several errors. Study the cuda vectorAdd sample code. Follow a similar sequence.