Does anyone know how to build a 2D/3D array using CudaMalloc?

Can anyone here help me? I want to build a 3D array using cudaMalloc. Is that possible, and if so, how?
I wrote my own code. It compiles without errors, but I get an error when I run it.
The error is: Unhandled exception at 0x1000faa5 in CUDA-Try1.exe: 0xC0000005: Access violation writing location 0x00100000.
Here is my code. I am just a beginner at this, so any help will be appreciated. Thanks :D

#include <cstdio>

#include "iostream"
#include "conio.h"

#define N 10

// Doubles every element of an N x N integer matrix that is stored as a table
// of row pointers: b[row][col] = a[row][col] + a[row][col].
//
// Launch layout: a single block with at least N threads (add<<<1,N>>>).
// Thread threadIdx.x processes one complete row; blockIdx is not consulted,
// so additional blocks would only repeat the same work.
//
// Parameters:
//   a - table of N row pointers, each row holding N ints (read).
//   b - table of N row pointers, each row holding N ints (written).
// The kernel dereferences both the tables and the rows they point to, so all
// of them must be addresses that device code is allowed to access.
__global__ void add( int **a, int **b ) {
    int tid = threadIdx.x;
    // Guard against launches with more than N threads per block.
    if (tid < N) {
        for (int j=0; j<N; j++) {
            b[tid][j] = a[tid][j] + a[tid][j];
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t status, const char *what ) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
    return false;
}

// Fills an N x N host matrix, doubles it on the GPU with the add kernel and
// prints the result. Returns 0 on success, 1 if any CUDA call failed.
int main( void ) {
    int a[N][N], b[N][N];
    // Host-resident arrays holding the *device* addresses of each row.
    // cudaMalloc stores its result through a host pointer, so the row
    // pointers are collected here first and the finished tables are copied
    // to the device afterwards; host code must never write to &dev_a[i].
    int *rows_a[N], *rows_b[N];
    int **dev_a = NULL, **dev_b = NULL;
    bool ok = true;

    for (int i=0; i<N; i++) {
        rows_a[i] = NULL;
        rows_b[i] = NULL;
        for (int j=0; j<N; j++) {
            a[i][j] = i * j;
            b[i][j] = 0;
        }
    }

    // One device allocation per row; the input rows are uploaded right away.
    for (int i=0; ok && i<N; i++) {
        ok = cudaOk( cudaMalloc( (void **)&rows_a[i], N*sizeof(int) ), "cudaMalloc (row of a)" )
          && cudaOk( cudaMalloc( (void **)&rows_b[i], N*sizeof(int) ), "cudaMalloc (row of b)" )
          && cudaOk( cudaMemcpy( rows_a[i], a[i], N*sizeof(int), cudaMemcpyHostToDevice ),
                     "cudaMemcpy (row of a to device)" );
    }

    // Device-resident pointer tables: N pointers each, not N*N.
    ok = ok && cudaOk( cudaMalloc( (void **)&dev_a, N*sizeof(int *) ), "cudaMalloc (table a)" );
    ok = ok && cudaOk( cudaMalloc( (void **)&dev_b, N*sizeof(int *) ), "cudaMalloc (table b)" );
    ok = ok && cudaOk( cudaMemcpy( dev_a, rows_a, N*sizeof(int *), cudaMemcpyHostToDevice ),
                       "cudaMemcpy (table a to device)" );
    ok = ok && cudaOk( cudaMemcpy( dev_b, rows_b, N*sizeof(int *), cudaMemcpyHostToDevice ),
                       "cudaMemcpy (table b to device)" );

    if (ok) {
        add<<<1,N>>>( dev_a, dev_b );
        // A launch does not return an error code itself: query it explicitly,
        // then wait so that faults inside the kernel are reported here.
        ok = cudaOk( cudaGetLastError(), "kernel launch" )
          && cudaOk( cudaDeviceSynchronize(), "kernel execution" );
    }

    // Only b is fetched back: the kernel never writes to a.
    for (int i=0; ok && i<N; i++) {
        ok = cudaOk( cudaMemcpy( b[i], rows_b[i], N*sizeof(int), cudaMemcpyDeviceToHost ),
                     "cudaMemcpy (row of b to host)" );
    }

    if (ok) {
        for (int i=0; i<N; i++) {
            for (int j=0; j<N; j++) {
                printf( "a[%d][%d] + a[%d][%d] = %d + %d = %d\n", i, j, i, j, a[i][j], a[i][j], b[i][j] );
            }
        }
    }

    // Release every row as well as the two tables; cudaFree accepts NULL,
    // so this is safe after a partial failure.
    for (int i=0; i<N; i++) {
        cudaFree( rows_a[i] );
        cudaFree( rows_b[i] );
    }
    cudaFree( dev_a );
    cudaFree( dev_b );

    getch();
    return ok ? 0 : 1;

}

Can anyone here help me? I want to build a 3D array using cudaMalloc. Is that possible, and if so, how?
I wrote my own code. It compiles without errors, but I get an error when I run it.
The error is: Unhandled exception at 0x1000faa5 in CUDA-Try1.exe: 0xC0000005: Access violation writing location 0x00100000.
Here is my code. I am just a beginner at this, so any help will be appreciated. Thanks :D

#include “iostream”
#include “conio.h”

#define N 10

// Doubles every element of an N x N integer matrix that is stored as a table
// of row pointers: b[row][col] = a[row][col] + a[row][col].
//
// Launch layout: a single block with at least N threads (add<<<1,N>>>).
// Thread threadIdx.x processes one complete row; blockIdx is not consulted,
// so additional blocks would only repeat the same work.
//
// Parameters:
//   a - table of N row pointers, each row holding N ints (read).
//   b - table of N row pointers, each row holding N ints (written).
// The kernel dereferences both the tables and the rows they point to, so all
// of them must be addresses that device code is allowed to access.
__global__ void add( int **a, int **b ) {
    int tid = threadIdx.x;
    // Guard against launches with more than N threads per block.
    if (tid < N) {
        for (int j=0; j<N; j++) {
            b[tid][j] = a[tid][j] + a[tid][j];
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t status, const char *what ) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
    return false;
}

// Fills an N x N host matrix, doubles it on the GPU with the add kernel and
// prints the result. Returns 0 on success, 1 if any CUDA call failed.
int main( void ) {
    int a[N][N], b[N][N];
    // Host-resident arrays holding the *device* addresses of each row.
    // cudaMalloc stores its result through a host pointer, so the row
    // pointers are collected here first and the finished tables are copied
    // to the device afterwards; host code must never write to &dev_a[i].
    int *rows_a[N], *rows_b[N];
    int **dev_a = NULL, **dev_b = NULL;
    bool ok = true;

    for (int i=0; i<N; i++) {
        rows_a[i] = NULL;
        rows_b[i] = NULL;
        for (int j=0; j<N; j++) {
            a[i][j] = i * j;
            b[i][j] = 0;
        }
    }

    // One device allocation per row; the input rows are uploaded right away.
    for (int i=0; ok && i<N; i++) {
        ok = cudaOk( cudaMalloc( (void **)&rows_a[i], N*sizeof(int) ), "cudaMalloc (row of a)" )
          && cudaOk( cudaMalloc( (void **)&rows_b[i], N*sizeof(int) ), "cudaMalloc (row of b)" )
          && cudaOk( cudaMemcpy( rows_a[i], a[i], N*sizeof(int), cudaMemcpyHostToDevice ),
                     "cudaMemcpy (row of a to device)" );
    }

    // Device-resident pointer tables: N pointers each, not N*N.
    ok = ok && cudaOk( cudaMalloc( (void **)&dev_a, N*sizeof(int *) ), "cudaMalloc (table a)" );
    ok = ok && cudaOk( cudaMalloc( (void **)&dev_b, N*sizeof(int *) ), "cudaMalloc (table b)" );
    ok = ok && cudaOk( cudaMemcpy( dev_a, rows_a, N*sizeof(int *), cudaMemcpyHostToDevice ),
                       "cudaMemcpy (table a to device)" );
    ok = ok && cudaOk( cudaMemcpy( dev_b, rows_b, N*sizeof(int *), cudaMemcpyHostToDevice ),
                       "cudaMemcpy (table b to device)" );

    if (ok) {
        add<<<1,N>>>( dev_a, dev_b );
        // A launch does not return an error code itself: query it explicitly,
        // then wait so that faults inside the kernel are reported here.
        ok = cudaOk( cudaGetLastError(), "kernel launch" )
          && cudaOk( cudaDeviceSynchronize(), "kernel execution" );
    }

    // Only b is fetched back: the kernel never writes to a.
    for (int i=0; ok && i<N; i++) {
        ok = cudaOk( cudaMemcpy( b[i], rows_b[i], N*sizeof(int), cudaMemcpyDeviceToHost ),
                     "cudaMemcpy (row of b to host)" );
    }

    if (ok) {
        for (int i=0; i<N; i++) {
            for (int j=0; j<N; j++) {
                printf( "a[%d][%d] + a[%d][%d] = %d + %d = %d\n", i, j, i, j, a[i][j], a[i][j], b[i][j] );
            }
        }
    }

    // Release every row as well as the two tables; cudaFree accepts NULL,
    // so this is safe after a partial failure.
    for (int i=0; i<N; i++) {
        cudaFree( rows_a[i] );
        cudaFree( rows_b[i] );
    }
    cudaFree( dev_a );
    cudaFree( dev_b );

    getch();
    return ok ? 0 : 1;

}

You do not allocate multi-dimensional arrays like that in CUDA - performance will be dreadful. You allocate a 1D array big enough to hold all the data, and then compute locations into that (i.e. `index = ix + nx*iy`). This is what you should do on the CPU side too, unless you're looking to benchmark the caching hardware on your chip.

You do not allocate multi-dimensional arrays like that in CUDA - performance will be dreadful. You allocate a 1D array big enough to hold all the data, and then compute locations into that (i.e. `index = ix + nx*iy`). This is what you should do on the CPU side too, unless you're looking to benchmark the caching hardware on your chip.

What do ix, nx and iy stand for? And how do I convert the result back to a 3D array on the CPU side after I compute it on the GPU side?

can i have an example of that?

sorry i’m still a beginner…

What do ix, nx and iy stand for? And how do I convert the result back to a 3D array on the CPU side after I compute it on the GPU side?

can i have an example of that?

sorry i’m still a beginner…

The numbers ix, iy, and nx are (respectively) the desired x index, the desired y index and the number of elements in the x direction (I'm assuming that the array is stored running the x index fastest). For a 3D array the same idea extends to `index = ix + nx*(iy + ny*iz)`. And copying it to a 3D array on the CPU side? As I said, you should never, never have code looking like `C[i][j][k]` on the CPU, unless you are specifically trying to benchmark the caching hardware.

As a getting-started example, this is a very simple Matrix class

// Minimal dense 2D float matrix stored in a single contiguous block that is
// allocated with 16-byte alignment.
//
// Element (i,j) lives at data[i + j*ny] with i < ny and j < nx, i.e. the
// first index is the one that runs fastest through memory.
// Define CHECK_BOUNDS to make the access operators validate their indices.
class Matrix {

public:

  // Default constructor: an empty matrix that owns no storage.
  Matrix( void ) : nx(0), ny(0), data(NULL) {}

  // Destructor: hands back the storage, if any.
  virtual ~Matrix( void ) {
    this->Release();
  }

  // Copy constructor: deep copy of the dimensions and elements of src.
  Matrix( const Matrix& src ) : nx(0),
				ny(0),
				data(NULL) {
    this->CopyFrom( src );
  }

  // ------------------------
  // Memory management

  // Allocation: discards the current contents and reserves room for
  // nxVals * nyVals floats. The new elements are left uninitialised.
  // If either dimension is zero no storage is reserved and data stays NULL.
  // On overflow of the byte count or allocation failure the problem is
  // reported on cerr and the program exits, in the same way the bounds
  // check does.
  void Allocate( const unsigned int nxVals,
		 const unsigned int nyVals )  {
    this->Release();

    if( (nxVals != 0) && (nyVals != 0) ) {
      // Do the size arithmetic in size_t and refuse requests whose byte
      // count would wrap around.
      const size_t maxElems = static_cast<size_t>(-1) / sizeof(float);
      void* block = NULL;
      if( nxVals <= maxElems / nyVals ) {
	block = _mm_malloc( static_cast<size_t>(nxVals) * nyVals * sizeof(float), 16 );
      }
      if( block == NULL ) {
	std::cerr << __FUNCTION__
		  << ": Unable to allocate "
		  << nxVals << " x " << nyVals << " floats" << std::endl;
	exit( EXIT_FAILURE );
      }
      this->data = static_cast<float*>( block );
    }

    this->nx = nxVals;
    this->ny = nyVals;
  }

  // Release: frees the storage (if any) and resets the matrix to empty.
  // The dimensions are reset even when no storage was held.
  void Release( void ) {
    if( this->data != NULL ) {
      _mm_free( this->data );
      this->data = NULL;
    }
    this->nx = 0;
    this->ny = 0;
  }

  // ------------------------
  // Assignment operator: deep copy of src.
  // Self-assignment is a no-op; releasing first would zero the very
  // dimensions that are about to be copied and lose the data.
  Matrix& operator=( const Matrix& src ) {
    if( this != &src ) {
      this->CopyFrom( src );
    }
    return( *this );
  }

  // ------------------------
  // Access operators

  // Read access to element (i,j); returns the value.
  inline float operator()( const unsigned int i,
		    const unsigned int j ) const {
#ifdef CHECK_BOUNDS
    this->CheckBounds( i, j, __FUNCTION__ );
#endif
    return( this->data[this->Offset( i, j )] );
  }

  // Read/write access to element (i,j); returns a reference into data.
  inline float& operator()( const unsigned int i,
		     const unsigned int j ) {
#ifdef CHECK_BOUNDS
    this->CheckBounds( i, j, __FUNCTION__ );
#endif
    return( this->data[this->Offset( i, j )] );
  }

  // ------------------------
  // Data members

  unsigned int nx, ny;
  float* data;

private:

  // Position of element (i,j) inside data, computed in size_t so that the
  // product cannot wrap in 32-bit arithmetic.
  inline size_t Offset( const unsigned int i,
			const unsigned int j ) const {
    return( static_cast<size_t>(i) + static_cast<size_t>(j) * this->ny );
  }

  // Replaces the contents of this matrix with a deep copy of src.
  // Must not be called with src == *this.
  void CopyFrom( const Matrix& src ) {
    this->Allocate( src.nx, src.ny );
    // memcpy must not be handed NULL pointers, even for a zero length.
    if( (this->data != NULL) && (src.data != NULL) ) {
      memcpy( this->data,
	      src.data,
	      static_cast<size_t>(this->nx) * this->ny * sizeof(float) );
    }
  }

#ifdef CHECK_BOUNDS
  // Reports an out-of-range index pair on cerr and exits.
  // caller is the name printed in front of the message.
  void CheckBounds( const unsigned int i,
		    const unsigned int j,
		    const char* caller ) const {
    if( (i>=this->ny) || (j>=this->nx) ) {
      std::cerr << caller
		<< ": Out of bounds "
		<< i << " " << j << std::endl;
      std::cerr << "Maxes are "
		<< this->ny << " " << this->nx << std::endl;
      exit( EXIT_FAILURE );
    }
  }
#endif

};

To the more experienced in C++… this was a quick example, so error checking is not as robust as it might be. The point is that it allocates the 2D matrix as a single block of memory, and overloads the parenthesis operator to allow access via 2D indices.

One can then write code such as

// Naive dense matrix product C = A * B.
//
// A is read as A.ny rows by A.nx columns and B as B.ny rows by B.nx columns;
// the product requires A.nx == B.ny, otherwise a message is written to cerr
// and the program exits. C is reallocated to B.nx by A.ny when its current
// dimensions differ.
// C may be the same object as A or B: in that case the product is formed in
// a temporary first, because resizing or writing C in place would destroy
// an operand that is still being read.
void MultiplySimple( Matrix& C, const Matrix&A, const Matrix& B ) {

  if( A.nx != B.ny ) {
    std::cerr << __FUNCTION__
	      << ": Matrices not conformable" << std::endl;
    exit( EXIT_FAILURE );
  }

  // Output aliases an input: compute into scratch storage, then copy.
  if( (&C == &A) || (&C == &B) ) {
    Matrix scratch;
    MultiplySimple( scratch, A, B );
    C = scratch;
    return;
  }

  if( (C.nx!=B.nx) || (C.ny!=A.ny) ) {
    C.Allocate( B.nx, A.ny );
  }

  for( unsigned int i=0; i<C.ny; i++ ) {
    for( unsigned int j=0; j<C.nx; j++ ) {
      // Accumulate the dot product of row i of A with column j of B.
      float Cij = 0;
      for( unsigned int k=0; k<A.nx; k++ ) {
	Cij += A(i,k) * B(k,j);
      }
      C(i,j) = Cij;
    }
  }

}

And yes, that code represents an extreme example of cache-cruelty too, and is completely ignoring the carefully inserted `_mm_malloc`.

The numbers ix, iy, and nx are (respectively) the desired x index, the desired y index and the number of elements in the x direction (I'm assuming that the array is stored running the x index fastest). For a 3D array the same idea extends to `index = ix + nx*(iy + ny*iz)`. And copying it to a 3D array on the CPU side? As I said, you should never, never have code looking like `C[i][j][k]` on the CPU, unless you are specifically trying to benchmark the caching hardware.

As a getting-started example, this is a very simple Matrix class

// Minimal dense 2D float matrix stored in a single contiguous block that is
// allocated with 16-byte alignment.
//
// Element (i,j) lives at data[i + j*ny] with i < ny and j < nx, i.e. the
// first index is the one that runs fastest through memory.
// Define CHECK_BOUNDS to make the access operators validate their indices.
class Matrix {

public:

  // Default constructor: an empty matrix that owns no storage.
  Matrix( void ) : nx(0), ny(0), data(NULL) {}

  // Destructor: hands back the storage, if any.
  virtual ~Matrix( void ) {
    this->Release();
  }

  // Copy constructor: deep copy of the dimensions and elements of src.
  Matrix( const Matrix& src ) : nx(0),
				ny(0),
				data(NULL) {
    this->CopyFrom( src );
  }

  // ------------------------
  // Memory management

  // Allocation: discards the current contents and reserves room for
  // nxVals * nyVals floats. The new elements are left uninitialised.
  // If either dimension is zero no storage is reserved and data stays NULL.
  // On overflow of the byte count or allocation failure the problem is
  // reported on cerr and the program exits, in the same way the bounds
  // check does.
  void Allocate( const unsigned int nxVals,
		 const unsigned int nyVals )  {
    this->Release();

    if( (nxVals != 0) && (nyVals != 0) ) {
      // Do the size arithmetic in size_t and refuse requests whose byte
      // count would wrap around.
      const size_t maxElems = static_cast<size_t>(-1) / sizeof(float);
      void* block = NULL;
      if( nxVals <= maxElems / nyVals ) {
	block = _mm_malloc( static_cast<size_t>(nxVals) * nyVals * sizeof(float), 16 );
      }
      if( block == NULL ) {
	std::cerr << __FUNCTION__
		  << ": Unable to allocate "
		  << nxVals << " x " << nyVals << " floats" << std::endl;
	exit( EXIT_FAILURE );
      }
      this->data = static_cast<float*>( block );
    }

    this->nx = nxVals;
    this->ny = nyVals;
  }

  // Release: frees the storage (if any) and resets the matrix to empty.
  // The dimensions are reset even when no storage was held.
  void Release( void ) {
    if( this->data != NULL ) {
      _mm_free( this->data );
      this->data = NULL;
    }
    this->nx = 0;
    this->ny = 0;
  }

  // ------------------------
  // Assignment operator: deep copy of src.
  // Self-assignment is a no-op; releasing first would zero the very
  // dimensions that are about to be copied and lose the data.
  Matrix& operator=( const Matrix& src ) {
    if( this != &src ) {
      this->CopyFrom( src );
    }
    return( *this );
  }

  // ------------------------
  // Access operators

  // Read access to element (i,j); returns the value.
  inline float operator()( const unsigned int i,
		    const unsigned int j ) const {
#ifdef CHECK_BOUNDS
    this->CheckBounds( i, j, __FUNCTION__ );
#endif
    return( this->data[this->Offset( i, j )] );
  }

  // Read/write access to element (i,j); returns a reference into data.
  inline float& operator()( const unsigned int i,
		     const unsigned int j ) {
#ifdef CHECK_BOUNDS
    this->CheckBounds( i, j, __FUNCTION__ );
#endif
    return( this->data[this->Offset( i, j )] );
  }

  // ------------------------
  // Data members

  unsigned int nx, ny;
  float* data;

private:

  // Position of element (i,j) inside data, computed in size_t so that the
  // product cannot wrap in 32-bit arithmetic.
  inline size_t Offset( const unsigned int i,
			const unsigned int j ) const {
    return( static_cast<size_t>(i) + static_cast<size_t>(j) * this->ny );
  }

  // Replaces the contents of this matrix with a deep copy of src.
  // Must not be called with src == *this.
  void CopyFrom( const Matrix& src ) {
    this->Allocate( src.nx, src.ny );
    // memcpy must not be handed NULL pointers, even for a zero length.
    if( (this->data != NULL) && (src.data != NULL) ) {
      memcpy( this->data,
	      src.data,
	      static_cast<size_t>(this->nx) * this->ny * sizeof(float) );
    }
  }

#ifdef CHECK_BOUNDS
  // Reports an out-of-range index pair on cerr and exits.
  // caller is the name printed in front of the message.
  void CheckBounds( const unsigned int i,
		    const unsigned int j,
		    const char* caller ) const {
    if( (i>=this->ny) || (j>=this->nx) ) {
      std::cerr << caller
		<< ": Out of bounds "
		<< i << " " << j << std::endl;
      std::cerr << "Maxes are "
		<< this->ny << " " << this->nx << std::endl;
      exit( EXIT_FAILURE );
    }
  }
#endif

};

To the more experienced in C++… this was a quick example, so error checking is not as robust as it might be. The point is that it allocates the 2D matrix as a single block of memory, and overloads the parenthesis operator to allow access via 2D indices.

One can then write code such as

// Naive dense matrix product C = A * B.
//
// A is read as A.ny rows by A.nx columns and B as B.ny rows by B.nx columns;
// the product requires A.nx == B.ny, otherwise a message is written to cerr
// and the program exits. C is reallocated to B.nx by A.ny when its current
// dimensions differ.
// C may be the same object as A or B: in that case the product is formed in
// a temporary first, because resizing or writing C in place would destroy
// an operand that is still being read.
void MultiplySimple( Matrix& C, const Matrix&A, const Matrix& B ) {

  if( A.nx != B.ny ) {
    std::cerr << __FUNCTION__
	      << ": Matrices not conformable" << std::endl;
    exit( EXIT_FAILURE );
  }

  // Output aliases an input: compute into scratch storage, then copy.
  if( (&C == &A) || (&C == &B) ) {
    Matrix scratch;
    MultiplySimple( scratch, A, B );
    C = scratch;
    return;
  }

  if( (C.nx!=B.nx) || (C.ny!=A.ny) ) {
    C.Allocate( B.nx, A.ny );
  }

  for( unsigned int i=0; i<C.ny; i++ ) {
    for( unsigned int j=0; j<C.nx; j++ ) {
      // Accumulate the dot product of row i of A with column j of B.
      float Cij = 0;
      for( unsigned int k=0; k<A.nx; k++ ) {
	Cij += A(i,k) * B(k,j);
      }
      C(i,j) = Cij;
    }
  }

}

And yes, that code represents an extreme example of cache-cruelty too, and is completely ignoring the carefully inserted `_mm_malloc`.