I can’t figure where is the mistake.
I am receiving garbage as c_out output.
#include <stdio.h>
#define NUM_THREADS 4
#define ARRAY_SIZE 2
#define NUM_BLOCKS 1
__global__ void as2D(int a_in[ARRAY_SIZE][ARRAY_SIZE],int b_in[ARRAY_SIZE][ARRAY_SIZE],int c_out[ARRAY_SIZE][ARRAY_SIZE]){
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row<ARRAY_SIZE && col<ARRAY_SIZE)
c_out[row][col]=a_in[row][col]+b_in[row][col];
}
__global__ void as1D(int a_in[ARRAY_SIZE],int b_in[ARRAY_SIZE],int c_out[ARRAY_SIZE]){
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if ((row<ARRAY_SIZE) && (col<ARRAY_SIZE))
c_out[row*ARRAY_SIZE+col]=a_in[row*ARRAY_SIZE+col]+b_in[row*ARRAY_SIZE+col];
}
int main(int argc,char **argv)
{
printf("%d total threads in %d blocks writing into %d array elements\n",
NUM_THREADS, NUM_BLOCKS, ARRAY_SIZE);
// declare and allocate host memory
int a_in[ARRAY_SIZE],b_in[ARRAY_SIZE],c_out[ARRAY_SIZE];
const int ARRAY_BYTES = ARRAY_SIZE *ARRAY_SIZE*sizeof(int);
//int a_in[ARRAY_SIZE][ARRAY_SIZE],b_in[ARRAY_SIZE][ARRAY_SIZE],c_out[ARRAY_SIZE][ARRAY_SIZE];
// const int ARRAY_BYTES = ARRAY_SIZE *ARRAY_SIZE*sizeof(int);
// declare, allocate, and zero out GPU memory
int *dev_a,*dev_b,*dev_c;
cudaMalloc((void **) &dev_a, ARRAY_BYTES);
cudaMemset((void *) dev_a, 0, ARRAY_BYTES);
cudaMalloc((void **) &dev_b, ARRAY_BYTES);
cudaMemset((void *) dev_b, 0, ARRAY_BYTES);
cudaMalloc((void **) &dev_c, ARRAY_BYTES);
cudaMemset((void *) dev_c, 0, ARRAY_BYTES);
/* //fill in the arrays a,b on CPU
for (int i=0;i<ARRAY_SIZE;i++){
for (int j=0;j<ARRAY_SIZE;j++){
a_in[i][j]=1;
b_in[i][j]=2;
}
}
//display g
for (int i=0;i<ARRAY_SIZE;i++){
for (int j=0;j<ARRAY_SIZE;j++){
printf("a[%d][%d]= %d \t b[%d][%d]= %d \n",i,j,a_in[i][j],i,j,b_in[i][j]);
}
printf("\n");
}
*/
for (int i=0;i<ARRAY_SIZE;i++){
for (int j=0;j<ARRAY_SIZE;j++){
a_in[i*ARRAY_SIZE+j]=1;
b_in[i*ARRAY_SIZE+j]=2;
}
}
for (int i=0;i<ARRAY_SIZE;i++){
for (int j=0;j<ARRAY_SIZE;j++){
printf("a[%d][%d]= %d \t b[%d][%d]= %d \n",i,j,a_in[i*ARRAY_SIZE+j],i,j,b_in[i*ARRAY_SIZE+j]);
}
printf("\n");
}
// copy back the arrays a,b to the GPU
cudaMemcpy(dev_a, a_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
as1D<<<NUM_BLOCKS,NUM_THREADS >>>(dev_a,dev_b,dev_c);
// copy back the array c from GPU to CPU
cudaMemcpy(c_out,dev_c, ARRAY_BYTES, cudaMemcpyDeviceToHost);
for (int i=0;i<ARRAY_SIZE;i++){
for (int j=0;j<ARRAY_SIZE;j++){
printf("c[%d][%d] = %d\t",i,j,c_out[i*ARRAY_SIZE+j]);
}
printf("\n");
}
// free GPU memory allocation and exit
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}