Hi all,
I am using cudaMallocPitch along with cudaMemcopy2D to get an array to the GPU. I can get the array onto the GPU and back off with success. The following code is what I use to do this:
cudaMallocPitch((REAL**)&d_phi0, &pitch, (size_t)(sxsizeof(REAL)), sy);
cudaMemcpy2D( d_phi0, pitch, &phi.Array[0][0][0], (size_t)(sxsizeof(REAL)), sxsizeof(REAL),sy, cudaMemcpyHostToDevice);
cudaMemcpy2D( &h_phi.Array[0][0], (size_t)(sxsizeof(REAL)), d_phi0, pitch, sx*sizeof(REAL), sy, cudaMemcpyDeviceToHost);
My problem is in trying to index the array. As I understand from reading the programing guide, pitch is the length in x direction(number of columns). To get to the first element in the second row of the array I would do the following.
d_phi0[pitch]=BLAH;
This sounds easy, however the pitch returned by cudaMallocPitch is wrong. to get the first element of the second row I must use
d_phi0[pitch/8]=BLAH;
The real pitch in my case is pitch/8. What gives? The pitch returned to me is 512. The size of my array in the x direction is 42. pitch/8 corresponds to 64(which seems more reasonable than 512 anyway considereing my array size).
Thank you very much in advance.
Full Code (condensed):
const int sx = 42;
const int sy = 21;
REAL* d_phi0 = NULL, *d_phi1=NULL, *d_phi2=NULL;
size_t pitch = NULL;
//cudaInitializeArrays(phi, d_phi0, d_phi1, d_phi2, pitch);
cudaMallocPitch((REAL**)&d_phi0, &pitch, (size_t)(sx*sizeof(REAL)), sy);
cudaMallocPitch((REAL**)&d_phi1, &pitch, (size_t)(sx*sizeof(REAL)), sy);
cudaMallocPitch((REAL**)&d_phi2, &pitch, (size_t)(sx*sizeof(REAL)), sy);
cudaMemcpy2D( d_phi0, pitch, &phi.Array[0][0][0], (size_t)(sx*sizeof(REAL)), sx*sizeof(REAL),sy, cudaMemcpyHostToDevice);
cudaMemcpy2D( d_phi1, pitch, &phi.Array[1][0][0], (size_t)(sx*sizeof(REAL)), sx*sizeof(REAL),sy, cudaMemcpyHostToDevice);
cudaMemcpy2D( d_phi2, pitch, &phi.Array[2][0][0], (size_t)(sx*sizeof(REAL)), sx*sizeof(REAL),sy, cudaMemcpyHostToDevice);
phi.CloseARRAY3D(); // close phi because it isnt needed
h_phi.declareArray2D(sx,sy); // open new 2D array for output
dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
dim3 dimGrid(sx/dimBlock.x, sy/dimBlock.y);
timeStep<<<dimGrid, dimBlock>>>(d_phi0, d_phi1, d_phi2, pitch, sx, sy, K.k); // this function checks the indexing, for now
cudaMemcpy2D( &h_phi.Array[0][0], (size_t)(sx*sizeof(REAL)), d_phi2, pitch, sx*sizeof(REAL), sy, cudaMemcpyDeviceToHost);
writeArray2TextFast("./output/output1.txt", h_phi.Array, sx, sy);