2D host memory allocation

Hello, I want to simulate 2D flows with CUDA (Navier-Stokes-Equations).

I researched the Forum and found out how to copy 2D allocated arrays to the device and vice versa. This worked for me with cudaMemcpy2D.

Following code should be the shortest to demonstrate problem:

#include<stdio.h>

#include<assert.h>

__global__ void matAdd(float *A_d, float *B_d, size_t pitch_A, size_t pitch_B)

{ 

  int idx=blockIdx.x*blockDim.x+threadIdx.x;

  int idy=blockIdx.y*blockDim.y+threadIdx.y;

  //add +1 to every element

  *((float *)((char *)B_d+idy*pitch_B)+idx)=*((float *)((char *)A_d+idy*pitch_A)+idx)+1;

}

int main()

{ 

  //size of 2D square array

  int N=4;

//device memory

  float *A_d, *B_d;

size_t size=N*sizeof(float);

//simple allocated host memory

  float A_h[N][N], B_h[N][N];																																													  

//allocate device memory

  size_t pitch_A, pitch_B;

  cudaMallocPitch((void **)(&A_d), &pitch_A, size, N);

  cudaMallocPitch((void **)(&B_d), &pitch_B, size, N);

//write some data in host memory

  for (int j=0; j<N; j++) for (int i=0; i<N; i++) A_h[i][j]=i*j;

//copy host to device memory

  cudaMemcpy2D(A_d, pitch_A, A_h, size, size, N, cudaMemcpyHostToDevice);

//Kernel invocation

  dim3 dimBlock(N/2, N/2);

  dim3 nBlocks(2, 2);

  matAdd<<<nBlocks, dimBlock>>>(A_d, B_d, pitch_A, pitch_B);

//copy device to host memory

  cudaMemcpy2D(B_h, size, B_d, pitch_B, size, N, cudaMemcpyDeviceToHost);

//print results

  for (int j=0; j<N; j++) for (int i=0; i<N; i++) printf("%d\t%d\t%f\t%f\n", i, j, A_h[i][j], B_h[i][j]);

//free device memory

  cudaFree(B_d);

  cudaFree(A_d);

return 0;

}

These 2D arrays are allocated at compile time (code notation: float A[N][N]).

The problem is that I have to use runtime-allocated 2D arrays (double pointers).

Therefore I simply changed source-code in following way: deleted “float A[N][N]”-declarations and added runtime-allocation and freeing code.

#include<stdio.h>

#include<assert.h>

__global__ void matAdd(float *A_d, float *B_d, size_t pitch_A, size_t pitch_B)

{ 

  int idx=blockIdx.x*blockDim.x+threadIdx.x;

  int idy=blockIdx.y*blockDim.y+threadIdx.y;

  //add +1 to every element

  *((float *)((char *)B_d+idy*pitch_B)+idx)=*((float *)((char *)A_d+idy*pitch_A)+idx)+1;

}

int main()

{ 

  //size of square array

  int N=4;

//device memory

  float *A_d, *B_d;

size_t size=N*sizeof(float);

//runtime allocated host memory

  float **A_h, **B_h;

  A_h=(float **)malloc(N*sizeof(float *));

  for (int j=0; j<N; j++) *(A_h+j)=(float *)malloc(N*sizeof(float));

  B_h=(float **)malloc(N*sizeof(float *));

  for (int j=0; j<N; j++) *(B_h+j)=(float *)malloc(N*sizeof(float));

//allocate device memory

  size_t pitch_A, pitch_B;

  cudaMallocPitch((void **)(&A_d), &pitch_A, size, N);

  cudaMallocPitch((void **)(&B_d), &pitch_B, size, N);

//write some data in host memory

  for (int j=0; j<N; j++) for (int i=0; i<N; i++) A_h[i][j]=i*j;

//copy host to device memory

  cudaMemcpy2D(A_d, pitch_A, A_h, size, size, N, cudaMemcpyHostToDevice);

//Kernel invocation

  dim3 dimBlock(N/2, N/2);

  dim3 nBlocks(2, 2);

  matAdd<<<nBlocks, dimBlock>>>(A_d, B_d, pitch_A, pitch_B);

//copy device to host memory

  cudaMemcpy2D(B_h, size, B_d, pitch_B, size, N, cudaMemcpyDeviceToHost);

//print results

  for (int j=0; j<N; j++) for (int i=0; i<N; i++) printf("%d\t%d\t%f\t%f\n", i, j, A_h[i][j], B_h[i][j]);

//free device memory

  cudaFree(B_d);

  cudaFree(A_d);

//free host memory

  for (int j=0; j<N; j++) free(*(B_h+j));

  free(B_h);

  for (int j=0; j<N; j++) free(*(A_h+j));

  free(A_h);

return 0;

}

Unfortunately that doesnt work, I got “Segmentation fault” error.

Is CUDA able to deal with this double-pointer notation in cudaMemcpy2D?

Any help would be appreciated, thank you!

No, cudaMemcpy2D deals with copying subregions of continuous memory, not 2D arrays created by two dereferences. It only worked the first time because your compiler was making a[N][N] continuous.

Thanks for your reply, now I see the issue!
My current solution is to declare an 1D runtime-allocated-array (created with a pointer) and copy my double-pointer-2D-array to this 1D-array (with offsets). This 1D-array is input of cudaMemcpy2D.
I even tried out, it works! :)
I think the copy process is a little bit cost-intensive. It will cost O(N^2) operations because every single element has to be copied.
Is there a more efficient way to copy double-pointer-2D-array structures to single-pointer-1D-array structures?

Can you please post your solution here? I have still not figured out a way for doing a 2D array on CUDA.