Hello, I want to simulate 2D flows with CUDA (Navier-Stokes-Equations).
I researched the Forum and found out how to copy 2D allocated arrays to the device and vice versa. This worked for me with cudaMemcpy2D.
Following code should be the shortest to demonstrate problem:
#include<stdio.h>
#include<assert.h>
__global__ void matAdd(float *A_d, float *B_d, size_t pitch_A, size_t pitch_B)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
int idy=blockIdx.y*blockDim.y+threadIdx.y;
//add +1 to every element
*((float *)((char *)B_d+idy*pitch_B)+idx)=*((float *)((char *)A_d+idy*pitch_A)+idx)+1;
}
int main()
{
//size of 2D square array
int N=4;
//device memory
float *A_d, *B_d;
size_t size=N*sizeof(float);
//simple allocated host memory
float A_h[N][N], B_h[N][N];
//allocate device memory
size_t pitch_A, pitch_B;
cudaMallocPitch((void **)(&A_d), &pitch_A, size, N);
cudaMallocPitch((void **)(&B_d), &pitch_B, size, N);
//write some data in host memory
for (int j=0; j<N; j++) for (int i=0; i<N; i++) A_h[i][j]=i*j;
//copy host to device memory
cudaMemcpy2D(A_d, pitch_A, A_h, size, size, N, cudaMemcpyHostToDevice);
//Kernel invocation
dim3 dimBlock(N/2, N/2);
dim3 nBlocks(2, 2);
matAdd<<<nBlocks, dimBlock>>>(A_d, B_d, pitch_A, pitch_B);
//copy device to host memory
cudaMemcpy2D(B_h, size, B_d, pitch_B, size, N, cudaMemcpyDeviceToHost);
//print results
for (int j=0; j<N; j++) for (int i=0; i<N; i++) printf("%d\t%d\t%f\t%f\n", i, j, A_h[i][j], B_h[i][j]);
//free device memory
cudaFree(B_d);
cudaFree(A_d);
return 0;
}
These 2D arrays are allocated at compile time (code notation: float A[N][N]).
The problem is that I have to use runtime-allocated 2D arrays (double pointers).
Therefore I simply changed source-code in following way: deleted “float A[N][N]”-declarations and added runtime-allocation and freeing code.
#include<stdio.h>
#include<assert.h>
__global__ void matAdd(float *A_d, float *B_d, size_t pitch_A, size_t pitch_B)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
int idy=blockIdx.y*blockDim.y+threadIdx.y;
//add +1 to every element
*((float *)((char *)B_d+idy*pitch_B)+idx)=*((float *)((char *)A_d+idy*pitch_A)+idx)+1;
}
int main()
{
//size of square array
int N=4;
//device memory
float *A_d, *B_d;
size_t size=N*sizeof(float);
//runtime allocated host memory
float **A_h, **B_h;
A_h=(float **)malloc(N*sizeof(float *));
for (int j=0; j<N; j++) *(A_h+j)=(float *)malloc(N*sizeof(float));
B_h=(float **)malloc(N*sizeof(float *));
for (int j=0; j<N; j++) *(B_h+j)=(float *)malloc(N*sizeof(float));
//allocate device memory
size_t pitch_A, pitch_B;
cudaMallocPitch((void **)(&A_d), &pitch_A, size, N);
cudaMallocPitch((void **)(&B_d), &pitch_B, size, N);
//write some data in host memory
for (int j=0; j<N; j++) for (int i=0; i<N; i++) A_h[i][j]=i*j;
//copy host to device memory
cudaMemcpy2D(A_d, pitch_A, A_h, size, size, N, cudaMemcpyHostToDevice);
//Kernel invocation
dim3 dimBlock(N/2, N/2);
dim3 nBlocks(2, 2);
matAdd<<<nBlocks, dimBlock>>>(A_d, B_d, pitch_A, pitch_B);
//copy device to host memory
cudaMemcpy2D(B_h, size, B_d, pitch_B, size, N, cudaMemcpyDeviceToHost);
//print results
for (int j=0; j<N; j++) for (int i=0; i<N; i++) printf("%d\t%d\t%f\t%f\n", i, j, A_h[i][j], B_h[i][j]);
//free device memory
cudaFree(B_d);
cudaFree(A_d);
//free host memory
for (int j=0; j<N; j++) free(*(B_h+j));
free(B_h);
for (int j=0; j<N; j++) free(*(A_h+j));
free(A_h);
return 0;
}
Unfortunately that doesnt work, I got “Segmentation fault” error.
Is CUDA able to deal with this double-pointer notation in cudaMemcpy2D?
Any help would be appreciated, thank you!