/*
 * Device-side setup for the per-grid arrays u, rhs and res.
 *
 * For every grid i a pitched 2D device buffer of dimU[i] x dimU[i] doubles is
 * allocated for each of u, rhs and res, and filled from the host arrays
 * h_u[i], h_rhs[i] and h_res[i]. The resulting device pointers are gathered
 * in host-side staging tables (d_uMalloc, d_rhsMalloc, d_resMalloc) and then
 * copied into the device-resident pointer tables d_u, d_rhs and d_res.
 *
 * Every cudaMemcpy2D uses the pitch returned by the cudaMallocPitch call that
 * created its destination buffer; a pitch from a different allocation (or an
 * unset one) would misplace rows.
 *
 * Names used but declared outside this fragment: numGrids, dimension, i,
 * d_pitchBytes, Pitch, h_u, h_rhs, h_res.
 */

/* Report a failed CUDA runtime call with its source location and stop, so an
 * early failure is not mistaken for a later, unrelated one. */
#ifndef GRID_CUDA_CHECK
#define GRID_CUDA_CHECK(call)                                                 \
    do {                                                                      \
        cudaError_t gridErr_ = (call);                                        \
        if (gridErr_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(gridErr_));                            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

int dimU[numGrids];//These values are initialised elsewhere, but the array is included for completeness
double **d_u;          /* device table of numGrids pointers to the u buffers   */
double **d_rhs;        /* device table of numGrids pointers to the rhs buffers */
double **d_res;        /* device table of numGrids pointers to the res buffers */
double **d_uMalloc;    /* host staging table holding the u device pointers     */
double **d_rhsMalloc;  /* host staging table holding the rhs device pointers   */
double **d_resMalloc;  /* host staging table holding the res device pointers   */

/* Row pitch (bytes) of the host source arrays.
 * NOTE(review): this is the same for every grid; it is only correct if each
 * h_u[i]/h_rhs[i]/h_res[i] really has rows of `dimension` doubles. If the host
 * arrays are packed per grid, the source pitch should be
 * dimU[i]*sizeof(double) instead -- confirm against the host allocation. */
size_t h_pitchBytes = dimension*sizeof(double);

/* Pitch (bytes) of the grid-0 u buffer; kept for reporting only. */
size_t memcpyPitch = 0;

GRID_CUDA_CHECK(cudaMalloc((void**) &d_u, numGrids*sizeof(double *)));
GRID_CUDA_CHECK(cudaMalloc((void**) &d_res, numGrids*sizeof(double *)));
GRID_CUDA_CHECK(cudaMalloc((void**) &d_rhs, numGrids*sizeof(double *)));

d_uMalloc=(double**)malloc(numGrids*sizeof(double*));
d_rhsMalloc=(double**)malloc(numGrids*sizeof(double*));
d_resMalloc=(double**)malloc(numGrids*sizeof(double*));
if(d_uMalloc==NULL || d_rhsMalloc==NULL || d_resMalloc==NULL)
{
    fprintf(stderr, "Host allocation of device pointer tables failed\n");
    exit(EXIT_FAILURE);
}

for(i=0;i<numGrids;i++)
{
    const size_t rowBytes = dimU[i]*sizeof(double); /* payload bytes per row */
    const size_t rows = dimU[i];                    /* number of rows        */

    /* Allocate all three buffers first, remembering each one's own pitch. */
    GRID_CUDA_CHECK(cudaMallocPitch((void**) &d_uMalloc[i], &d_pitchBytes, rowBytes, rows));
    const size_t uPitchBytes = d_pitchBytes;
    GRID_CUDA_CHECK(cudaMallocPitch((void**) &d_rhsMalloc[i], &d_pitchBytes, rowBytes, rows));
    const size_t rhsPitchBytes = d_pitchBytes;
    GRID_CUDA_CHECK(cudaMallocPitch((void**) &d_resMalloc[i], &d_pitchBytes, rowBytes, rows));
    const size_t resPitchBytes = d_pitchBytes;

    if(i==0){
        memcpyPitch=uPitchBytes;
        printf("%zu \n",memcpyPitch); /* size_t needs %zu, not %i */
    }

    /* Copy host data in, using the destination buffer's own pitch. */
    GRID_CUDA_CHECK(cudaMemcpy2D(d_uMalloc[i], uPitchBytes, h_u[i], h_pitchBytes, rowBytes, rows, cudaMemcpyHostToDevice));
    GRID_CUDA_CHECK(cudaMemcpy2D(d_rhsMalloc[i], rhsPitchBytes, h_rhs[i], h_pitchBytes, rowBytes, rows, cudaMemcpyHostToDevice));
    GRID_CUDA_CHECK(cudaMemcpy2D(d_resMalloc[i], resPitchBytes, h_res[i], h_pitchBytes, rowBytes, rows, cudaMemcpyHostToDevice));

    /* Pitch[i] stores a single pitch per grid (in elements), taken from the
     * last allocation (res). Warn if the three buffers of this grid do not
     * share it, because one value cannot then describe all of them. */
    if(uPitchBytes!=resPitchBytes || rhsPitchBytes!=resPitchBytes)
    {
        fprintf(stderr, "Warning: grid %d has differing pitches (u=%zu, rhs=%zu, res=%zu bytes)\n",
                (int)i, uPitchBytes, rhsPitchBytes, resPitchBytes);
    }
    Pitch[i]=d_pitchBytes/sizeof(double);
}
printf("%s \n","Device Memory Allocation Completed");

/* Publish the per-grid device pointers into the device-resident tables. */
GRID_CUDA_CHECK(cudaMemcpy(d_u,d_uMalloc,numGrids*sizeof(double *), cudaMemcpyHostToDevice));
GRID_CUDA_CHECK(cudaMemcpy(d_res,d_resMalloc,numGrids*sizeof(double *), cudaMemcpyHostToDevice));
GRID_CUDA_CHECK(cudaMemcpy(d_rhs,d_rhsMalloc,numGrids*sizeof(double *), cudaMemcpyHostToDevice));
OK, sorry, it was a long day…