I have a large chunk of data, ~4 GB, that fits into host memory but is too big for device memory. I partitioned my space into z slices such that each slice is about 3 kB. Each kernel launch reads three of these 3 kB slices, does its computation, and updates one of them. Below is how my code is currently managing the memory. I want to know whether this is the best way to do it, or whether there is a better way of handling the memory management.
__host__ void update(float3 **e_, float3 **h_, float3 *e_cur_d, float3 *e_cur_h,
                     float3 *h_cur_d, float3 *h_cur_h, float3 *h_prev_d, float3 *h_prev_h){
    int timestep;
    int max_timesteps = 10000;
    int k;
    int i;

    for(timestep = 0; timestep < max_timesteps; timestep++){
        // force the x component of h at the centre of the grid each timestep
        h_[dim_z/2][(dim_x*dim_y)/2].x = 5.0f;

        for(k = 1; k < dim_z; k++){
            // stage slice k of e and h, and slice k-1 of h, into contiguous host buffers
            for(i = 0; i < dim_x * dim_y; i++){
                e_cur_h[i].x = e_[k][i].x;
                e_cur_h[i].y = e_[k][i].y;
                e_cur_h[i].z = e_[k][i].z;
                h_cur_h[i].x = h_[k][i].x;
                h_cur_h[i].y = h_[k][i].y;
                h_cur_h[i].z = h_[k][i].z;
                h_prev_h[i].x = h_[k-1][i].x;
                h_prev_h[i].y = h_[k-1][i].y;
                h_prev_h[i].z = h_[k-1][i].z;
            }
            // copy the three staged slices to the device
            cudaMemcpy((void*)e_cur_d, e_cur_h, dim_x*dim_y*sizeof(float3), cudaMemcpyHostToDevice);
            cudaMemcpy((void*)h_cur_d, h_cur_h, dim_x*dim_y*sizeof(float3), cudaMemcpyHostToDevice);
            cudaMemcpy((void*)h_prev_d, h_prev_h, dim_x*dim_y*sizeof(float3), cudaMemcpyHostToDevice);

            // update the e field for this slice on the device
            eUpdateSlice<<<100,256>>>(e_cur_d, h_cur_d, h_prev_d, dim_x, dim_y, 0, 0);

            // copy the updated e slice back and write it into the full host array
            cudaMemcpy((void*)e_cur_h, e_cur_d, dim_x*dim_y*sizeof(float3), cudaMemcpyDeviceToHost);
            for(i = 0; i < dim_x * dim_y; i++){
                e_[k][i].x = e_cur_h[i].x;
                e_[k][i].y = e_cur_h[i].y;
                e_[k][i].z = e_cur_h[i].z;
            }
        }
    }
}
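For context, the buffers are set up along these lines (a simplified sketch of the allocation, error checking omitted; in this sketch the host memory is ordinary pageable malloc and there is one slice-sized buffer per field on the device):

// full field data on the host: dim_z slices, each dim_x*dim_y float3 values (~3 kB per slice)
float3 **e_ = (float3**)malloc(dim_z * sizeof(float3*));
float3 **h_ = (float3**)malloc(dim_z * sizeof(float3*));
for(int k = 0; k < dim_z; k++){
    e_[k] = (float3*)malloc(dim_x * dim_y * sizeof(float3));
    h_[k] = (float3*)malloc(dim_x * dim_y * sizeof(float3));
}

// host staging buffers, one slice each
float3 *e_cur_h  = (float3*)malloc(dim_x * dim_y * sizeof(float3));
float3 *h_cur_h  = (float3*)malloc(dim_x * dim_y * sizeof(float3));
float3 *h_prev_h = (float3*)malloc(dim_x * dim_y * sizeof(float3));

// device buffers, one slice each
float3 *e_cur_d, *h_cur_d, *h_prev_d;
cudaMalloc((void**)&e_cur_d, dim_x * dim_y * sizeof(float3));
cudaMalloc((void**)&h_cur_d, dim_x * dim_y * sizeof(float3));
cudaMalloc((void**)&h_prev_d, dim_x * dim_y * sizeof(float3));

update(e_, h_, e_cur_d, e_cur_h, h_cur_d, h_cur_h, h_prev_d, h_prev_h);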