Passing a multidimensional array to kernel how to allocate space in host and pass to device?

In chapter 2.2 of the CUDA programming guide, there is an incomplete example of using multidimensional arrays. Here is the code:

[codebox]// Kernel definition

global void MatAdd(float A[N][N], float B[N][N],

                   float C[N][N])

{

int i = threadIdx.x;

int j = threadIdx.y;

C[i][j] = A[i][j] + B[i][j];

}

int main()

{

// Kernel invocation

dim3 dimBlock(N, N);

MatAdd<<<1, dimBlock>>>(A, B, C);

}[/codebox]

how do I declare and allocate the space for A, B, and C in the host using cudaMalloc?

Well, there it works because you’re passing type float[N][N], where N is #define’d and therefore known at compile time. As a result those “2D” accesses will actually become 1D.

If you want to do multidimensional arrays sized at runtime, it’s a much bigger pain. Let me try to find the post where I explained all of this in excruciating detail…

edit: http://forums.nvidia.com/index.php?s=&…st&p=445325 a 2D array is basically a special case of an array of structs containing other pointers.

I understand that N is known at compile time, but for the purposes of my understanding the CUDA language, how do I fix the code so it compiles? I do not understand how to declare/allocate a float[N][N], copy data from host to this array, and run the kernel.

well, it looks something like this:

#include <stdlib.h>

#include <stdio.h>

#define N 16

// Kernel definition

__global__ void MatAdd(float A[N][N], float B[N][N],

					   float C[N][N])

{

	int i = threadIdx.x;

	int j = threadIdx.y;

	C[i][j] = A[i][j] + B[i][j];

}

int main()

{

  float *A, *B, *C;

  cudaMalloc((void**)&A, sizeof(float) * N * N);

  cudaMalloc((void**)&B, sizeof(float) * N * N);

  cudaMalloc((void**)&C, sizeof(float) * N * N);

  // Kernel invocation

  dim3 dimBlock(N, N);

  MatAdd<<<1, dimBlock>>>((float(*) [16])A, (float(*) [16])B, (float(*)[16])C);

  if (cudaGetLastError() != cudaSuccess)

	printf("kernel launch failed\n");

  cudaThreadSynchronize();

  if (cudaGetLastError() != cudaSuccess)

	printf("kernel execution failed\n");

}

note that I’m completely ignoring cudaMemcpy here or host-side allocations, and dereferencing A, B or C from the host code will cause a segfault (since they’re GPU pointers, not CPU pointers).

The method works. Thank you!

I am continuing from this thread because it seems related to passing in a multi-d array to a kernel AND using it. My question is on using the multi-d array in the kernel, how do i do it? Consider the following code, is this what I should be doing if I have for loops inside the kernel?

// Kernel definition

__global__ void MatAdd(float A[N][N], float B[N][N],

					   float C[N][N])

{

	int tidi = threadIdx.x;

	int tidj = threadIdx.y;

	const int THREAD_N = blockDim.x * gridDim.x;	

for(int i=tidi; i< N; i +=THREAD_N)

		 for(int j=tidj; j< N; j +=THREAD_N)

			   C[i][j] = A[i][j] + B[i][j];

}

please correct me as my understanding of CUDA is very limited.

Thank you

No, that is not correct. You can access multidimensional arrays in CUDA the same way you do in C. so instead of adding THREAD_N to i and j each iteration, you only have to add one. For example, the first element is A[0][0], the next element is A[0][1].

THREAD_N is the same as N. So by adding THREAD_N to tidi and tidj, you are going out of bounds on your array.

I apologize if this doesn’t answer your question. I’m having difficulty understanding your code. What are you trying to make it to do?

I guess what I am trying to do is to take some 2-d arrays and do some adding and inserting of data at specific array elements

Currently I have this kernel:

// kernel that adds noise and initialize other arrays

__global__ void init_arrays(double coded_bits_d, 

										double received_bits_d,

										double L_r_d,

										double L_v_d,

										double *N_0_d,

										double *SNR_d)

{

	 int i = threadIdx.x;

	 int j = threadIdx.y;

	 //just add noise per SNR

	 // this loop gets all of the symbol bits into received bits

	 double noise, E;

	 for(int i=tidi;i<SIZEY;i++) {

						 

		  for(int j=tidj;j<SIZEX;j++) {

			 noise = GaussianRV(N_0_d);

			 E = ((*SNR_d)*(*N_0_d)*4)/8;

			 received_bits_d[i][j] = sqrt(E)*coded_bits_d[i][j] + noise;

			 L_r_d[i][j] = 4*received_bits_d[i][j]*(*SNR_d);

			 L_v_d[i][j] = 0;

		  }

	 }					 

}

Here is the GaussianRV kernel for reference

__device__ double GaussianRV(double *N_0)

{

   double v1,v2;

   double w;

   double y1;

   double y2;

   int use_last = 0;

if (use_last == 0)

   {

	  do

	  {

		 v1 = 2.0*uniformRV() - 1.0;

		 v2 = 2.0*uniformRV() - 1.0;

		 w = v1*v1 + v2*v2;

	  }while (w >= 1.0);

		 

	  w = sqrt((-2.0*log(w))/w);

		 

	  y1 = v1*w;

	  y2 = v2*w;

		 

	  use_last = 1;

   }

	  

   else

   {

	  y1 = y2;

		   

	  use_last = 0;

   }

return (y1*sqrt(*N_0/2));

}

But the code just seg faults.

This is the way I am declaring those arrays

//create 2-d arrays on CUDA

	double *received_bits_d, *L_v_d, *L_h_d, *L_r_d, *decoded_bits_d;

	

	//received_bits_d

	cudaMalloc((void**)&received_bits_d, sizeof(double) * SIZEY * SIZEX);

	//L_v_d

	cudaMalloc((void**)&L_v_d, sizeof(double) * SIZEY * SIZEX);

	

	//L_h_d

	cudaMalloc((void**)&L_h_d, sizeof(double) * SIZEY * SIZEX);

	

	//L_r_d 

	cudaMalloc((void**)&L_r_d, sizeof(double) * SIZEY * SIZEX);

	

	//decoded_bits_d

	cudaMalloc((void**)&decoded_bits_d, sizeof(double) * SIZEY * SIZEX);

and this is how I copied the original bits into the coded_bits_d array

//create the matrix populated with a sample coded bits //////////

	// now using dynamically allocated arrays with pointers

	double **coded_bits_h;

	coded_bits_h = new double*;

	for(i = 0; i < SIZEY; i++) {

		  coded_bits_h[i] = new double ;

	} 

...some data insertion into array...

//create a 2-d array on CUDA

	double *coded_bits_d;

	cudaMalloc((void**)&coded_bits_d, sizeof(double) * SIZEY * SIZEX);

	

	//copy the contents of the coded bits from host to device

	cudaMemcpy(coded_bits_d, coded_bits_h, sizeof(double) * SIZEY * SIZEX, cudaMemcpyHostToDevice);

and here is how I am calling the kernel:

//call CUDA to generate the packet array

   init_arrays<<<BLOCK_NUM, THREAD_NUM>>>((double(*) )coded_bits_d, 

										  (double(*) )received_bits_d,

										  (double(*) )L_r_d, 

										  (double(*) )L_v_d, 

										  N_0_d, SNR_d);

	

   //error check for kernel

   if (cudaGetLastError() != cudaSuccess) {

	  cout << "kernel launch failed\n";

   }

   cudaThreadSynchronize();

   if (cudaGetLastError() != cudaSuccess) {

	  cout << "kernel execution failed\n";

   }

While I am not trying to get my stuff just coded for me, I am really lost as to how to make use of the GPU. There is no error message so I am assuming the kernel launches and runs ok, but the program runs a bit and then seg faults. I am thinking I am not really using parallel computing techniques and making the kernel do something that is ordinarily done on the CPU. Not sure. But I am thinking it must be something to do with how i am declaring, allocating, and using the 2-d arrays. Most of the reference material online are 1-d arrays, should I be considering re-doing my code to use only 1-d arrays? i rather keep it 2-d but theres not that much documentation on using 2-d with CUDA.

I am especially confused on how to index the 2-d arrays with blockIdx.x, blockDim.x,threadIdx.x

Please Help!

One thing you are doing wrong is you are allocating coded_bits_h as a dynamically allocated array of dynamically allocated arrays. For cuda you really want contiguous buffers so the runtime can copy contiguous regions of memory to the device. I would guess that is where the runtime is segfaulting.

well, i tried this and it doesnt work either, still seg-faults

double coded_bits_h;

...insert data into array manually...

//create a 2-d array on CUDA

	double *coded_bits_d;

	cudaMalloc((void**)&coded_bits_d, sizeof(double) * SIZEY * SIZEX);

	

	//copy the contents of the coded bits from host to device

	cudaMemcpy(coded_bits_d, coded_bits_h, sizeof(double) * SIZEY * SIZEX, cudaMemcpyHostToDevice);

How is one suppose to create a 2-d host array, cudaMalloc a 2-d device array, and then cudaMemCpy the 2-d host array to the one on device? and then to use in kernel?

Maybe my understanding of CUDA is fundementally wrong, but this is how its done on typical 1-d arrays, i dont knwo why its not working on here.

Please Help!

[quote name=‘flagman5’ post=‘574729’ date=‘Aug 6 2009, 05:35 AM’]

well, i tried this and it doesnt work either, still seg-faults

[codebox]#define ERROR_CHECK() error_check(__LINE__)

void error_check(int line)

{

cudaError_t cudaError;

cudaError = cudaGetLastError();

if( cudaError != cudaSuccess )

{

	printf("CUDA Runtime API Error reported : %s in file %s on line %d.\n", cudaGetErrorString(cudaError), __FILE__, line);

}

}

[/codebox]

if you call ERROR_CHECK(); after each cuda function then if there is an error it will tell you what file and around what line it was found (the line number references the call to ERROR_CHECK()…not an actual line that has a problem).

Where is N_0_d and SNR_d allocated on the device? you didn’t seem to include that in your snippet of code from an earlier post.

I want to pass a multi dimensional array to kernel, and unfortunately it is not in the format of A[N][N],
it is something like A[N][M].

How can i call kernel now.

MatAdd<<<1, dimBlock>>>((???);

Find a worked example below. Please note that in the interest of conciseness I have omitted error checking. For your code you would want to check the return status of every CUDA API call and kernel launch.

Multi-dimensional arrays can be stored in a contiguous chunk of memory either following a row-major or column-major layout (see Wikipedia for an explanation of the terms). Here I have chosen row major which is the layout used by C and C++. Note that CUBLAS uses the column-major storage convention instead for easy interoperability with existing linear algebra software.

#include <stdio.h>
#include <stdlib.h>

#define N 3
#define M 4
#define A(i,j) A[(i)*cols+(j)]  // row-major layout
#define B(i,j) B[(i)*cols+(j)]  // row-major layout
#define C(i,j) C[(i)*cols+(j)]  // row-major layout

__global__ void MatAdd (const double *A, const double *B, double *C,
                        int rows, int cols)
{
    int row = threadIdx.y;
    int col = threadIdx.x;
    if ((row < rows) && (col < cols)) {
        C(row,col) = A(row, col) + B(row, col);
    }
}

int main (void)
{    
    double A [N][M] = {{1, 2, 3, 4},
                       {5, 6, 7, 8},
                       {9, 0, 1, 2}};
    double B [N][M] = {{3, 3, 3, 3},
                       {1, 1, 1, 1},
                       {0, 0, 0, 0}};
    double *C;
    double *A_d = 0, *B_d = 0, *C_d = 0;
    int rows = N;
    int cols = M;
    dim3 blockDim(M, N);
    C = (double *)malloc (sizeof(*C)*N*M);
    cudaMalloc ((void**)&A_d, sizeof(*A_d)*N*M);
    cudaMalloc ((void**)&B_d, sizeof(*B_d)*N*M);
    cudaMalloc ((void**)&C_d, sizeof(*C_d)*N*M);
    cudaMemcpy (A_d, A, sizeof(*A_d)*N*M, cudaMemcpyHostToDevice);
    cudaMemcpy (B_d, B, sizeof(*B_d)*N*M, cudaMemcpyHostToDevice);
    MatAdd<<<1,blockDim>>>(A_d, B_d, C_d, rows, cols);
    cudaMemcpy (C, C_d, sizeof(*C)*N*M, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("%g ", C(i,j));
        }
        printf ("\n");
    }
    cudaFree (C_d);
    cudaFree (B_d);
    cudaFree (A_d);
    free (C);
    return EXIT_SUCCESS;
}

The output of this program should look like so:

4 5 6 7
6 7 8 9
9 0 1 2