How to cudaMalloc two-dimensional array ?

I dont understand much of your code.

You are using “temp” for storing both host and device pointers… :-(

When you store device pointers, you are actuall over-writing the host pointers…

Where are you copying out the data? I dont see you copying out the device pointers to d_ptr as well…

I am using temp to allocate rows of d_Ptr. I followed your example in earlier post of this thread. As an alternative I used

for(i = 0; i < 5; i++)

  cudaMalloc((void**)&d_Ptr[i] , 3 * sizeof(float)); 

Instead of 

for(i = 0; i < 5; i++)

  cudaMalloc((void**)&temp[i] , 3 * sizeof(float));

This I doubt and I saw while debugging.

The example I posted is to just allocate and de-allocate memory. I am neither passing in any content or retrieving from device pointer. In this connection I need your help.

As I tried different experiments with cudaMemcpy2D in both directions. ( h->D and D->H) .

The attempt of using copy contents to or from results in seg fault when trying to free memory from device pointer.

Can you illustrate how to copy to and copy from device pointer to host array to check. That wll be a great help. thanks a tonne.

Allocate 2D array on device

float **device2DArray;

   float *h_temp[5];

// Create 2D Array

  cudaMalloc((void **)&device2DArray, 5*sizeof(float *));

  for(int i=0; i<5; i++)

  {

	cudaMalloc( (void **)&h_temp[i], 3*sizeof(float));

  }

  cudaMemcpy(device2DArray, h_temp, 5*sizeof(float *), cudaMemcpyHostToDevice);

// Do not destroy the contents of h_temp

 // So, we dont need to copy the pointers from the device again and again. We will hold a copy of the row pointers in h_temp

Copy host** array onto device

float **cpuArray = someValid2DHostarray;

for(int i=0; i<5; i++)

{

   cudaMemcpy(h_temp[i], cpuArray[i], 3*sizeof(float), cudaMemcpyHostToDevice);

}

Now, device2DArray pointer in GPU is a true 2D array created on GPU with data…

The device2DArray can be passed to a kernel as “float **device2DArray” and the kernel can access it like “device2DArray[i][j]”.

Once done, to copy out the data back to host:

float **cpuArray = someValid2DHostarray;

for(int i=0; i<5; i++)

{

   cudaMemcpy(cpuArray[i], h_temp[i], 3*sizeof(float), cudaMemcpyDeviceToHost);

}

Now cpuArray[i][j] will have same contents as device2DArray[i][j].

HTH

1 Like

Million Thanks Sarnath, I was looking for this. but after analysing your solution, I think this will involve lot of transfer time between CPU->GPU & vice-versa. To offset this I used cudaMemcpy2D, as it copies everything in one shot.

Can you think of a solution where cudaMemcpy2D can be used???

No problem.

Have never used cudaMemcpy2D before.

but since you have the code now, I think the manual should help you out on this…

I think cuda2DArrays allocate memory as one single 1D array (just like how lutormx said…) and copy it out…

Check the manual… You should be able to find out.

Thank you for your time and efforts Sarnath, i really appreciate your earnest help. :).

I promise, i shall post a good solution that may help others.Thanks again for sharing your knowledge.

No problem! Glad I was able to help a person.

Good Luck!

I uswd your code but it is giving some error: argument of type “int” is incompatible with parameter of type “void *”

cudaMalloc expects a “void**” first argument. Cast it appropriately.

I got a problem.

I had tried to allocate 2Dhost and 2Ddevice array. And then try to use GPU to do some calculation in there.

Badluck, I can not figure out where I miss.

/*

 *	CONSIDER 4 BLOCKS, WITH 4 THREADS/BLOCS

 *

*/

#include"../common/book.h"

#define N 8

__host__ double** Make2DDoubleArray(int arraySizeX, int arraySizeY) 

{

	double** theArray;

	theArray = (double**) malloc(arraySizeX*sizeof(double*));

	for (int i = 0; i < arraySizeX; i++)

   	theArray[i] = (double*) malloc(arraySizeY*sizeof(double));

   	return theArray;

}

__host__ void showxx(double **A)

{	int i,j;

	for(i=0;i<N;i++){

		printf("\n");

		for(j=0;j<N;j++){

		printf("%.3f ",A[i][j]);;

		}

	}

}

__host__ void init2D(double **A)

{	int i,j;

	for(i=0;i<N;i++){

		for(j=0;j<N;j++){

			A[i][j] = i+j;

		}

	}

}

__global__ void add( double **A)

{	

	int x = threadIdx.x + blockIdx.x * blockDim.x;

    int y = threadIdx.y + blockIdx.y * blockDim.y;

    int offset = x + y * blockDim.x * gridDim.x;

	A[x][y] = 4;

}

int main(void)

{	

	double** A = Make2DDoubleArray(N, N);

	double **d_2Darray;

	double *h_temp[N];

	int i;

	init2D(A);

	showxx(A);

	

	  // Create 2D Array on device

  cudaMalloc((void **)&d_2Darray, N*sizeof(double *));

  for(int i=0; i<N; i++)

  {

        cudaMalloc( (void **)&h_temp[i], N*sizeof(double));

  }

  cudaMemcpy(d_2Darray, h_temp, N*sizeof(double *), cudaMemcpyHostToDevice);

	//Copy host** arrayn to device

	for(int i=0; i<N; i++)

	{

	cudaMemcpy(h_temp[i], A[i], N*sizeof(double), cudaMemcpyHostToDevice);

	}

	add<<<4,4>>>(d_2Darray);

	printf("===========================");

	

	//copy back to host

	for(int i=0; i<N; i++)

	{

    cudaMemcpy(A[i], h_temp[i], N*sizeof(double), cudaMemcpyDeviceToHost);

	}

	showxx(A);

	

	for (i = 0; i < N; i++){

   		free(A[i]);

	}

	free(A);

	

	

	return 0;

}

Hi,
You have not allocated a 2D block…
<<<4,4>>> means that you are spawning 4 blocks with 4 threads each.
Thus a total of 16 threads in X dimension.
Y dimension is always only 1.
Going by your kernel source, “y” wiall always be “1” and hence only column 1 will be populated for some elements.
Whats the answer that you are getting?
btw,
Your allocation of 2D arrays is correct. Nice.

I got a problem.

I had tried to allocate 2Dhost and 2Ddevice array. And then try to use GPU to do some calculation in there.

Badluck, I can not figure out How to deal with 41st line ?

I know that 1D array is better than 2D, but n00b (me) prefer this one.

Here is my error.

C:\cuda_by_example\sarit>nvcc matadd.cu

matadd.cu

matadd.cu(40): warning: variable "offset" was declared but never referenced

tmpxft_00000f5c_00000000-3_matadd.cudafe1.gpu

tmpxft_00000f5c_00000000-8_matadd.cudafe2.gpu

matadd.cu

matadd.cu(40): warning: variable "offset" was declared but never referenced

./matadd.cu(41): Warning: Cannot tell what pointer points to, assuming global me

mory space

ptxas C:/Users/7-64/AppData/Local/Temp/tmpxft_00000f5c_00000000-4_matadd.ptx, li

ne 594; warning : Double is not supported. Demoting to float

tmpxft_00000f5c_00000000-3_matadd.cudafe1.cpp

tmpxft_00000f5c_00000000-14_matadd.ii

C:\cuda_by_example\sarit>
/*

 *	CONSIDER 4 BLOCKS, WITH 4 THREADS/BLOCS

 *

*/

#include"../common/book.h"

#define N 8

__host__ double** Make2DDoubleArray(int arraySizeX, int arraySizeY) 

{

	double** theArray;

	theArray = (double**) malloc(arraySizeX*sizeof(double*));

	for (int i = 0; i < arraySizeX; i++)

   	theArray[i] = (double*) malloc(arraySizeY*sizeof(double));

   	return theArray;

}

__host__ void showxx(double **A)

{	int i,j;

	for(i=0;i<N;i++){

		printf("\n");

		for(j=0;j<N;j++){

		printf("%.3f ",A[i][j]);;

		}

	}

}

__host__ void init2D(double **A)

{	int i,j;

	for(i=0;i<N;i++){

		for(j=0;j<N;j++){

			A[i][j] = i+j;

		}

	}

}

__global__ void adda( double **A)

{	

	int x = threadIdx.x + blockIdx.x * blockDim.x;

    int y = threadIdx.y + blockIdx.y * blockDim.y;

    int offset = x + y * blockDim.x * gridDim.x;

	A[x][y] = 4;                 //HERE IS 41ST LINE

}

int main(void)

{	

	double** A = Make2DDoubleArray(N, N);

	double **d_2Darray;

	double *h_temp[N];

	int i;

	init2D(A);

	showxx(A);

	

        // Create 2D Array on device

        cudaMalloc((void **)&d_2Darray, N*sizeof(double *));

        for(int i=0; i<N; i++)

        {

           cudaMalloc( (void **)&h_temp[i], N*sizeof(double));

        }

        cudaMemcpy(d_2Darray, h_temp, N*sizeof(double *), cudaMemcpyHostToDevice);

	//Copy host** arrayn to device

	for(int i=0; i<N; i++)

	{

	cudaMemcpy(h_temp[i], A[i], N*sizeof(double), cudaMemcpyHostToDevice);

	}

	adda<<<4,4>>>(d_2Darray);

	printf("===========================");

	

	//copy back to host

	for(int i=0; i<N; i++)

	{

              cudaMemcpy(A[i], h_temp[i], N*sizeof(double), cudaMemcpyDeviceToHost);

	}

	showxx(A);

	

	for (i = 0; i < N; i++){

   		free(A[i]);

	}

	free(A);

	

	

	return 0;

}

Its an advisory warning. By default, compiler will assume global memory space which in your case (most cases) is good.
No need to worry about that.

Thank you for quick response.

I used to wonder about the warning that may affect to my result.

Here is result from adda.cu.

After ============= is the result that I copy back from GPU.

It should be 4 as the 41st line command.

?

C:\cuda_by_example\sarit>nvcc -arch=sm_13 adda.cu

adda.cu

adda.cu(38): warning: variable "offset" was declared but never referenced

tmpxft_00000ff8_00000000-3_adda.cudafe1.gpu

tmpxft_00000ff8_00000000-8_adda.cudafe2.gpu

adda.cu

adda.cu(38): warning: variable "offset" was declared but never referenced

./adda.cu(39): Warning: Cannot tell what pointer points to, assuming global memo

ry space

tmpxft_00000ff8_00000000-3_adda.cudafe1.cpp

tmpxft_00000ff8_00000000-14_adda.ii

C:\cuda_by_example\sarit>a.exe

0.000 1.000 2.000 3.000 4.000 5.000 6.000 7.000

1.000 2.000 3.000 4.000 5.000 6.000 7.000 8.000

2.000 3.000 4.000 5.000 6.000 7.000 8.000 9.000

3.000 4.000 5.000 6.000 7.000 8.000 9.000 10.000

4.000 5.000 6.000 7.000 8.000 9.000 10.000 11.000

5.000 6.000 7.000 8.000 9.000 10.000 11.000 12.000

6.000 7.000 8.000 9.000 10.000 11.000 12.000 13.000

7.000 8.000 9.000 10.000 11.000 12.000 13.000 14.000 ===========================

0.000 1.000 2.000 3.000 4.000 5.000 6.000 7.000

1.000 2.000 3.000 4.000 5.000 6.000 7.000 8.000

2.000 3.000 4.000 5.000 6.000 7.000 8.000 9.000

3.000 4.000 5.000 6.000 7.000 8.000 9.000 10.000

4.000 5.000 6.000 7.000 8.000 9.000 10.000 11.000

5.000 6.000 7.000 8.000 9.000 10.000 11.000 12.000

6.000 7.000 8.000 9.000 10.000 11.000 12.000 13.000

7.000 8.000 9.000 10.000 11.000 12.000 13.000 14.000

Try setting values to some arbitrary number greater than 8 in your kernel and then print the results.
OR there could be some problems with your cudaMemcpys failing. DO an erro check

A[x][y] = 99;                 //HERE IS 41ST LINE

And

adda<<<4,16>>>(d_2Darray);

I changed it as your advice. Problem is still the same.

I do not know how to do an erro check. Could you give me some advice ?

Or do I need to use cudaMemcpy2D instead ?

http://www.clear.rice.edu/comp422/resources/cuda/html/group__CUDART__MEMORY_g17f3a55e8c9aef5f90b67cdf22851375.html#g17f3a55e8c9aef5f90b67cdf22851375

It has only reference command without any example. I am totally blank when it without concrete example.

My goal is lattice model therefore element accessing is crucial for me.

The problem is with your kernel dimensions.
I just commented out “A[y] = xxx;” and added “A[0][0] = 5.;” in the kernel and it just works fine.

The launch fails with A[y]. Because blockIdx.xblockDim.x + threadIdx.x can result in values of 34 + 3 = 15 – which is well past the allocated pointers… Hence invalid pointer causes kernel to abort… Simple…

Thats why I have been asking from morning to pass 2 dimensional block.
Always check bounds before accessing…
Like say:
if ((x > 8) && (y > 8))
A[y] = …

You can use:
"cudaError_t err;


adda<<<…>>>(…);
err = cudaThreadSynchronize();
if (err != cudaSuccess)
printf(“Kernel launch failed…\n”);
"
Technically the error returned could be from some other GPU operation before the kernel launch too.
Read the CUDA manual

I got it works!

Thank you so much Sarnath.

I confirmed I had read manual everytime, I have poor software engineering background.

Frankly, I grad from Physics. Thus my understanding is limited.

Sorry to take pity on me. External Image

I changed some of my adda

First

__global__ void adda( double **A)

{       

    int x = threadIdx.x + blockIdx.x * blockDim.x;

    int y = threadIdx.y + blockIdx.y * blockDim.y;

    //int offset = x + y * blockDim.x * gridDim.x;

    if (x < N && y < N){				//2nd day revise

		//A[x][y] = 4.0  ;                 //HERE IS 41ST LINE

		//A[x][y] = blockIdx.x;

		//A[x][y] = blockIdx.y;

		//A[x][y] = threadIdx.x;

		//A[x][y] = threadIdx.y;

	}		

}

Second

dim3    blocks(N/4,N/4);		//2nd day revise

	dim3    threads(4,4);			//2nd day revise

Third

adda<<<blocks,threads>>>(d_2Darray);	//2nd day revise

Result adda function when uncomment selected line out.

A[y] = 4.0.

============GPU===============

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0

A[y] = blockIdx.x; I got

============GPU===============

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

A[y] = blockIdx.y;

============GPU===============

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

A[y] = threadIdx.x;

============GPU===============

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0

3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0

3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0

A[y] = threadIdx.y;

============GPU===============

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

0.0 1.0 2.0 3.0 0.0 1.0 2.0 3.0

CUDA erro check is in progress krub. :]

Highly regards

Glad yu got it working…
But please be aware that having 2D arrays in GPU causes one xtra pointer fetch which can be REALLY costly if done repeatedly… So, if you are gonna use this inside LOOPs then yu may want to re-think twice.
No performance sensitive code ever uses 2D arrays this way…
You will be fine with a single dimensinal array to represent the 2D array.

GPUs have very heavy flops. So, it will be ecnomical to spend some FLOPs in finding the address within the 1D array.
Memry accesses are costly…It can cost you several hundred cycles…

There are other aspects like coalesced memry access, shared memory bank conflicts which play a role in affecting performance. You can learn more frm the manual…

Programming is very simple. You can experiment easily with the computer… and offers a much more controlled environment for experimentation… unlike physics which can be much tougher.

Wish you good Luck,

Take care,
best Regards,
Sarnath

@sarnath

Hi! I was following your post to allocate a 2D memory on GPU and did something like you have mentioned. But I am not sure what I did was correct or not and moreover I am still not very much clear about the freeing the memory occupied by the 2D array from GPU. I will post the code that I have written below but it would be very much helpful if you could post a small sample code for the whole procedure. Like allocating 2D array,copying to device,kernel function operation,copying back the 2D array to host and freeing the memory in a single program then it would be very much helpful. Anyway I will post my code below that I am trying to write :

#include "all_headers.h"

__global__ void test_kernel(int **dev_env_t)
{
	  int tidx = blockDim.x +blockIdx.x*threadIdx.x;
	  int tidy = blockDim.x+blockIdx.x*threadIdx.y;

	  dev_env_t[tidx][tidy] = dev_env_t[tidx][tidy] +10;
}

void test_func_1(void)
{
	int i,k;
	int **env_t;
	int **dev_env_t;

	int env_end =30;
						 
	int *temp[30];

	env_t =(int **) malloc(env_end * sizeof *env_t);
	for(k=0;k<env_end;k++)
	{env_t[k] = (int *)malloc(env_end* env_end* sizeof *env_t[0]);																															   
	}

	for (k = 1; k < env_end; ++k)
		env_t[k] = env_t[k - 1] + env_end;

	memset(*env_t, 0, env_end * env_end* sizeof **env_t);

	 cudaMalloc((void **)&dev_env_t,env_end*sizeof(int));

	 for(i=0;i<env_end;i++)
	 {
			 cudaMalloc((void **)&temp[i],env_end*sizeof(int));	 }
																										  
	 cudaMemcpy(dev_env_t,temp,env_end*sizeof(int),cudaMemcpyHostToDevice);

	 for (i=0;i<env_end;i++)
	 {	 cudaMemcpy(temp[i],env_t[i],env_end*sizeof(int),cudaMemcpyHostToDevice);
	 }

	 dim3 gridDim(1,1);
	 dim3 blockDim(env_end,env_end,1);

	 test_kernel<<<gridDim,blockDim>>>(dev_env_t);

	 for (i=0;i<env_end;i++)
	{ cudaMemcpy(env_t[i],temp[i],env_end*sizeof(int),cudaMemcpyDeviceToHost);
	 }

	for (i=0;i<env_end;i++)
	{ free(env_t[i]);
	}
	free(env_t);

}

Notice : That I haven’t freed the 2D memory of the GPU as it is still not very much clear to me.