Hi, I'm new to CUDA and I'm trying to do some work with matrices.
The problem is that, even though I'm following the NVIDIA CUDA programming guide, I can't get this simple program to work.
(It allocates memory on the host and on the device, uses a kernel to assign the value 0 to every element of a matrix, and copies the matrix into host memory.)
#include<stdio.h>
#include<stdlib.h>
__global__ void lol(int *mat, size_t pitch);
/* Print a readable message and terminate if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Allocates an x-by-y int matrix on the host (one malloc per row) and a
 * pitched matrix of the same shape on the device, launches the `lol` kernel
 * with one thread per element to zero the device matrix, copies every row
 * back into the host matrix and prints each element.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main(){
    int *d_mat = 0,
        **h_mat;
    size_t pitch;
    int x = 4,      // number of rows (indexed by threadIdx.x in the kernel)
        y = 4,      // number of columns (indexed by threadIdx.y in the kernel)
        i, j;
    dim3 block(x, y);   // one thread per matrix element

    // alloc host matrix: x row pointers, each row holding y ints
    h_mat = (int **) malloc(x * sizeof(int *));
    if(h_mat == NULL){
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for(i = 0; i < x; i++){
        h_mat[i] = (int *) malloc(y * sizeof(int));
        if(h_mat[i] == NULL){
            fprintf(stderr, "host allocation failed\n");
            return EXIT_FAILURE;
        }
    }

    // alloc device matrix; the result must go into d_mat (not h_mat).
    // Width is the row length in bytes, height is the number of rows.
    checkCuda(cudaMallocPitch((void**)&d_mat, &pitch, y * sizeof(int), x),
              "cudaMallocPitch");

    lol<<<1, block>>>(d_mat, pitch);
    checkCuda(cudaGetLastError(), "kernel launch");        // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // faults raised while the kernel ran

    // copy the matrix into host memory one row at a time; device rows are
    // `pitch` bytes apart, host rows are separate allocations
    for(i = 0; i < x; i++){
        const char *d_row = (const char *) d_mat + i * pitch;
        checkCuda(cudaMemcpy(h_mat[i], d_row, y * sizeof(int), cudaMemcpyDeviceToHost),
                  "cudaMemcpy");
    }

    for(i = 0; i < x; i++){
        for(j = 0; j < y; j++){
            printf("[%d][%d]: %d\n", i, j, h_mat[i][j]);
        }
    }

    // release device and host memory
    checkCuda(cudaFree(d_mat), "cudaFree");
    for(i = 0; i < x; i++)
        free(h_mat[i]);
    free(h_mat);
    return 0;
}
/*
 * Writes 0 into one element of a pitched int matrix per thread.
 *
 * mat   - base pointer of the matrix
 * pitch - distance in bytes between the starts of consecutive rows
 *
 * threadIdx.x selects the row and threadIdx.y selects the column. There is
 * no bounds check and blockIdx is not used, so the launch is expected to be
 * a single block whose dimensions do not exceed the matrix dimensions.
 */
__global__ void lol(int *mat, size_t pitch){
    const int r = threadIdx.x;
    const int c = threadIdx.y;
    // Step to the start of row r in bytes, because pitch is a byte count.
    char *rowBytes = (char *) mat + r * pitch;
    ((int *) rowBytes)[c] = 0;
}
Any tips?
Thanks :D
Edit:
Solved: it was just a silly mistake.