A simple vector add (2D) doesn't show correct output (garbage output)

I can't figure out where the mistake is.
I am receiving garbage as the c_out output.

#include <stdio.h>

#define NUM_THREADS 4
#define ARRAY_SIZE  2
#define NUM_BLOCKS  1


__global__ void as2D(int  a_in[ARRAY_SIZE][ARRAY_SIZE],int b_in[ARRAY_SIZE][ARRAY_SIZE],int c_out[ARRAY_SIZE][ARRAY_SIZE]){
    
    int row = blockIdx.y * blockDim.y + threadIdx.y; 
    int col = blockIdx.x * blockDim.x + threadIdx.x; 
    
    
      
    if (row<ARRAY_SIZE && col<ARRAY_SIZE)
      c_out[row][col]=a_in[row][col]+b_in[row][col];
  
}


__global__ void as1D(int  a_in[ARRAY_SIZE],int b_in[ARRAY_SIZE],int c_out[ARRAY_SIZE]){
  
    int row = blockIdx.y * blockDim.y + threadIdx.y; 
    int col = blockIdx.x * blockDim.x + threadIdx.x; 
    
    if ((row<ARRAY_SIZE) && (col<ARRAY_SIZE))
      
      c_out[row*ARRAY_SIZE+col]=a_in[row*ARRAY_SIZE+col]+b_in[row*ARRAY_SIZE+col];
  
}

int main(int argc,char **argv)
{   
   
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_BLOCKS, ARRAY_SIZE);

    // declare and allocate host memory
    int a_in[ARRAY_SIZE],b_in[ARRAY_SIZE],c_out[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE *ARRAY_SIZE*sizeof(int);
   
    //int a_in[ARRAY_SIZE][ARRAY_SIZE],b_in[ARRAY_SIZE][ARRAY_SIZE],c_out[ARRAY_SIZE][ARRAY_SIZE];
   // const int ARRAY_BYTES = ARRAY_SIZE *ARRAY_SIZE*sizeof(int);
    
 
    // declare, allocate, and zero out GPU memory

    int *dev_a,*dev_b,*dev_c;
  
    cudaMalloc((void **) &dev_a, ARRAY_BYTES);
    cudaMemset((void *) dev_a, 0, ARRAY_BYTES); 
    cudaMalloc((void **) &dev_b, ARRAY_BYTES);
    cudaMemset((void *) dev_b, 0, ARRAY_BYTES); 
    cudaMalloc((void **) &dev_c, ARRAY_BYTES);
    cudaMemset((void *) dev_c, 0, ARRAY_BYTES); 
   
 /*   //fill in the arrays a,b on CPU
   for (int i=0;i<ARRAY_SIZE;i++){
	for (int j=0;j<ARRAY_SIZE;j++){
	  a_in[i][j]=1;
	  b_in[i][j]=2;
    
     }
   }
    //display g
    for (int i=0;i<ARRAY_SIZE;i++){
       for (int j=0;j<ARRAY_SIZE;j++){
      printf("a[%d][%d]= %d \t b[%d][%d]= %d \n",i,j,a_in[i][j],i,j,b_in[i][j]);
    
       }
       printf("\n");
     }
     */
 
 for (int i=0;i<ARRAY_SIZE;i++){
   for (int j=0;j<ARRAY_SIZE;j++){
    a_in[i*ARRAY_SIZE+j]=1;
    b_in[i*ARRAY_SIZE+j]=2;
   
  }
 
 }
 
 for (int i=0;i<ARRAY_SIZE;i++){
       for (int j=0;j<ARRAY_SIZE;j++){
      printf("a[%d][%d]= %d \t b[%d][%d]= %d \n",i,j,a_in[i*ARRAY_SIZE+j],i,j,b_in[i*ARRAY_SIZE+j]);
    
       }
       printf("\n");
     }
    // copy the arrays a,b to the GPU
    cudaMemcpy(dev_a, a_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
   
      as1D<<<NUM_BLOCKS,NUM_THREADS >>>(dev_a,dev_b,dev_c);
  
    
    // copy back the array c from GPU to CPU
    cudaMemcpy(c_out,dev_c, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    

   for (int i=0;i<ARRAY_SIZE;i++){
      for (int j=0;j<ARRAY_SIZE;j++){
    
    
   printf("c[%d][%d] = %d\t",i,j,c_out[i*ARRAY_SIZE+j]);
   
      }
      printf("\n");
     
}


 
    // free GPU memory allocation and exit
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
   
    return 0;
}

Your launch configuration is the problem; you are only launching in the x-dimension. There may be other problems, but at first glance that is one of them.
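
For example, one way to cover both dimensions (just a sketch, keeping your kernel and variable names) would be something like:

// Sketch only: launch a 2D block so that both row (y) and col (x) vary.
// With a 1D launch, row is always 0 inside the kernel, so only the first
// row of c_out ever gets written.
dim3 block(ARRAY_SIZE, ARRAY_SIZE);   // one thread per element: x = col, y = row
dim3 grid(1, 1);                      // a single block covers a 2x2 array
as1D<<<grid, block>>>(dev_a, dev_b, dev_c);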

Hmm…,
I tried:

dim3 dimBlock(NUM_BLOCKS, NUM_BLOCKS);
dim3 dimGrid(NUM_THREADS, NUM_THREADS);

as1D<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);

but still the same!

#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cassert>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
#include <ctime>
#include <cuda.h>
//#include <cufft.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#define _CRTDBG_MAP_ALLOC
#include <crtdbg.h>
using namespace std;

#define _DTH cudaMemcpyDeviceToHost
#define _DTD cudaMemcpyDeviceToDevice
#define _HTD cudaMemcpyHostToDevice
#define THREADS 64

bool InitMMTimer(UINT wTimerRes);
void DestroyMMTimer(UINT wTimerRes, bool init);

__global__ void add_matrix(const float* __restrict__ a, const float* __restrict__ b, float* __restrict__ c, const int sz){
	const int x=threadIdx.x+blockDim.x*blockIdx.x;
	const int y=blockIdx.y;
	if(x<sz && y<sz){
		c[y*sz+x]=a[y*sz+x]+b[y*sz+x];
	}
}

int main(){
	char ch;
	srand(time(NULL));
	const int sz=16;
	const int num_bytes=sz*sz*sizeof(float);
	float *host_a=(float *) malloc(num_bytes);
	float *host_b=(float *) malloc(num_bytes);
	float *host_c=(float *) malloc(num_bytes);
	memset(host_c,0,num_bytes);
	for(int i=0;i<sz;i++)for(int j=0;j<sz;j++){
		host_a[i*sz+j]=0.5f;
		host_b[i*sz+j]=0.25f;
	}
	dim3 grid((sz+THREADS-1)/THREADS,sz,1);

	float *d_a,*d_b,*d_c;
	cudaError_t err=cudaMalloc((void**)&d_a,num_bytes);
	err=cudaMalloc((void**)&d_b,num_bytes);
	err=cudaMalloc((void**)&d_c,num_bytes);
	err=cudaMemcpy(d_a,host_a,num_bytes,cudaMemcpyHostToDevice);
	err=cudaMemcpy(d_b,host_b,num_bytes,cudaMemcpyHostToDevice);
	err=cudaMemset(d_c,0,num_bytes);

	add_matrix<<<grid,THREADS>>>(d_a,d_b,d_c,sz);

	err=cudaMemcpy(host_c,d_c,num_bytes,cudaMemcpyDeviceToHost);

	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	for(int i=0;i<sz;i++){
		cout<<'\n';
		for(int j=0;j<sz;j++){
			cout<<host_c[i*sz+j]<<' ';

		}
	}

	free(host_a);
	free(host_b);
	free(host_c);
	cin>>ch;
	return 0;
}

//Definitions

bool InitMMTimer(UINT wTimerRes){
	TIMECAPS tc;
	if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
	wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
	timeBeginPeriod(wTimerRes); 
	return true;
}

void DestroyMMTimer(UINT wTimerRes, bool init){
	if(init)
		timeEndPeriod(wTimerRes);
}

Is that what you had in mind? Some of your terms are confusing. This outputs the addition of two matrices.

Also, since the grid is launched with exactly sz blocks in the y-dimension, y will only range between 0 and sz-1, so checking y<sz is not actually needed.

Yes, that is what I want to do!

The only difference I can notice is that you use

const int sz=16;
const int num_bytes=sz*sz*sizeof(float);
if(x<sz && y<sz){

and I use only

if ((row<ARRAY_SIZE) && (col<ARRAY_SIZE))
 const int ARRAY_BYTES = ARRAY_SIZE *ARRAY_SIZE*sizeof(int);

but I don’t think this is the cause…

Well my posted code should work, but it is not the most efficient way of adding matrices. Use cuBLAS for this if you need speed.
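
If you go the cuBLAS route, matrix addition maps to geam; roughly something like this (a sketch only, using the d_a, d_b, d_c and sz from my example above, not code I have timed):

#include <cublas_v2.h>

// Sketch: C = 1.0*A + 1.0*B with cublasSgeam, assuming d_a, d_b, d_c are
// sz x sz float matrices already resident on the device.
cublasHandle_t handle;
cublasCreate(&handle);
const float alpha = 1.0f, beta = 1.0f;
cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            sz, sz,              // rows, cols
            &alpha, d_a, sz,     // A and its leading dimension
            &beta,  d_b, sz,     // B and its leading dimension
            d_c, sz);            // C and its leading dimension
cublasDestroy(handle);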

The problem is that I want my code to work… and I can't figure out the error!

I found 2 errors, but I still have the problem!

Changed this:

__global__ void as1D(int a_in[ARRAY_SIZE],int b_in[ARRAY_SIZE],int c_out[ARRAY_SIZE]){

to this:

__global__ void as1D(int  a_in[ARRAY_SIZE*ARRAY_SIZE],int b_in[ARRAY_SIZE*ARRAY_SIZE],int c_out[ARRAY_SIZE*ARRAY_SIZE]){

and accordingly:

int a_in[ARRAY_SIZE*ARRAY_SIZE],b_in[ARRAY_SIZE*ARRAY_SIZE],c_out[ARRAY_SIZE*ARRAY_SIZE];

but still c_out shows garbage…

I am not sure what is happening, but now it is running fine!
I tried to run it through Nsight and from that moment it runs fine!

Why are you using full arrays as kernel parameters?

Look at how I did mine, and just use pointers with a separate integer parameter for the size. Use const if you are not going to change the inputs.

Again, look at how I (and everybody else) do it, instead of forcing this fixed-size array kernel parameter.

Is it wrong if I use

__global__ (int a[N]… instead of int *a?

I can't understand.

Thanks!

You need to use simple C style pointers to memory. If it is a 2-D or 3-D array just use the same indexing used above.

Just look at my example; that is standard practice. Do not worry about the '__restrict__' qualifier, just notice that arrays are represented as pointers. Use const when appropriate to make things clear.
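
Applied to your as1D, the standard form would look something like this (a sketch only, keeping your names; __restrict__ left out for clarity):

// Sketch: same kernel, but taking plain pointers plus the size,
// which is the usual way kernel parameters are written.
__global__ void as1D(const int *a_in, const int *b_in, int *c_out, const int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n)
        c_out[row * n + col] = a_in[row * n + col] + b_in[row * n + col];
}

// Launched the same way, just with the size passed in, e.g.:
// as1D<<<grid, block>>>(dev_a, dev_b, dev_c, ARRAY_SIZE);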

Ok, thank you!