Working with matrix

Hi, im new to CUDA and i’m trying to make something with matrix.

The problem is that following the Nvidia programming book for cuda I cant make this simple program working.

(It allocs memory for host and device, assign with kernel value 0 to a matrix and copy matrix into host memory space)

#include<stdio.h>

__global__ void lol(int *mat, size_t pitch);

int main(){

	 int *d_mat = 0,

	 **h_mat;

	 

	 size_t pitch; 

	 int x = 4,

	 y = 4,

	 i, j,

	 tmp;

	 dim3 block(4,4);

	 // alloc host matrix

	 h_mat = (int **) malloc(x * sizeof(int *));

	 for(i = 0; i < y; i++)

	  h_mat[i] = (int *) malloc(4 * sizeof(int));

	 

	 //alloc device matrix

	 cudaMallocPitch((void**)&h_mat, &pitch, x * sizeof(int), y);

	 lol<<<1, block>>>(d_mat, pitch);

	 for(i = 0; i < x; i++){

	  int *row = (int *) ((char*)d_mat + i * pitch);

	  for(j = 0; j < y; j++){

		   cudaMemcpy(&tmp, &row[j], sizeof(int), cudaMemcpyDeviceToHost);

		   printf("[%d][%d]: %d\n",i,j,tmp);

	  }

	 }

}

__global__ void lol(int *mat, size_t pitch){

	 int x = threadIdx.x;

	 int y = threadIdx.y;

	 

	 int *row = (int *) ((char*)mat + x * pitch);

	 row[y] = 0;

}

Any tips?

Thanks :D

Edit:

solved, it was only a stupid mistakes