Different memory usage on GPU vs CPU for the same data set

Hi,

I am trying to use OpenACC to speed up my program. I have a data structure which contains a few 1d and 2d arrays. I used copyin clause to copy this data structure from CPU host to my GPU device. The memory usage for this data structure should be around 2 GB. But the memory usage on GPU exceeds 8 GB. I attached a short test code which contains my issue. I have tried GTX1070 and TESLA K80. Both systems gave me the same issue. Can you help me to figure out where the problem is? Thanks a lot!

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <iostream>
#include <unistd.h>

#define GRID_SIZE_X 200
#define GRID_SIZE_Y 200
#define GRID_SIZE_ETA 64

struct Field {
     double *e_rk0;
     double *e_rk1;
     double *e_prev;
     double *rhob_rk0;
     double *rhob_rk1;
     double *rhob_prev;
     double **u_rk0;
     double **u_rk1;
     double **u_prev;
     double **dUsup;
     double **Wmunu_rk0;
     double **Wmunu_rk1;
     double **Wmunu_prev;
     double *pi_b_rk0;
     double *pi_b_rk1;
     double *pi_b_prev;
};

using namespace std;

// main program
int main(int argc, char *argv[]) {
    Field *hydro_fields = new Field;
    int n_cell = GRID_SIZE_ETA*(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1);
    hydro_fields->e_rk0 = new double[n_cell];
    hydro_fields->e_rk1 = new double[n_cell];
    hydro_fields->e_prev = new double[n_cell];
    hydro_fields->rhob_rk0 = new double[n_cell];
    hydro_fields->rhob_rk1 = new double[n_cell];
    hydro_fields->rhob_prev = new double[n_cell];
    hydro_fields->u_rk0 = new double* [n_cell];
    hydro_fields->u_rk1 = new double* [n_cell];
    hydro_fields->u_prev = new double* [n_cell];
    hydro_fields->dUsup = new double* [n_cell];
    hydro_fields->Wmunu_rk0 = new double* [n_cell];
    hydro_fields->Wmunu_rk1 = new double* [n_cell];
    hydro_fields->Wmunu_prev = new double* [n_cell];
    hydro_fields->pi_b_rk0 = new double[n_cell];
    hydro_fields->pi_b_rk1 = new double[n_cell];
    hydro_fields->pi_b_prev = new double[n_cell];
    for (int i = 0; i < n_cell; i++) {
        hydro_fields->e_rk0[i] = drand48();
        hydro_fields->e_rk1[i] = drand48();
        hydro_fields->e_prev[i] = drand48();
        hydro_fields->rhob_rk0[i] = drand48();
        hydro_fields->rhob_rk1[i] = drand48();
        hydro_fields->rhob_prev[i] = drand48();

        hydro_fields->u_rk0[i] = new double[4];
        hydro_fields->u_rk1[i] = new double[4];
        hydro_fields->u_prev[i] = new double[4];
        for (int j = 0; j < 4; j++) {
            hydro_fields->u_rk0[i][j] = drand48();
            hydro_fields->u_rk1[i][j] = drand48();
            hydro_fields->u_prev[i][j] = drand48();
        }
        hydro_fields->dUsup[i] = new double[20];
        for (int j = 0; j < 20; j++) {
            hydro_fields->dUsup[i][j] = drand48();
        }
        hydro_fields->Wmunu_rk0[i] = new double[14];
        hydro_fields->Wmunu_rk1[i] = new double[14];
        hydro_fields->Wmunu_prev[i] = new double[14];
        for (int j = 0; j < 14; j++) {
            hydro_fields->Wmunu_rk0[i][j] = drand48();
            hydro_fields->Wmunu_rk1[i][j] = drand48();
            hydro_fields->Wmunu_prev[i][j] = drand48();
        }
        hydro_fields->pi_b_rk0[i] = drand48();
        hydro_fields->pi_b_rk1[i] = drand48();
        hydro_fields->pi_b_prev[i] = drand48();
    }
    cout << "pre data copy" << endl;
    #pragma acc data copyin (hydro_fields[0:1],\
                         hydro_fields->e_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                         hydro_fields->e_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                         hydro_fields->e_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                         hydro_fields->rhob_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                         hydro_fields->rhob_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                         hydro_fields->rhob_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                         hydro_fields->u_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:4], \
                         hydro_fields->u_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:4], \
                         hydro_fields->u_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:4], \
                         hydro_fields->dUsup[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:20], \
                         hydro_fields->Wmunu_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:14], \
                         hydro_fields->Wmunu_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:14], \
                         hydro_fields->Wmunu_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA][0:14], \
                         hydro_fields->pi_b_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                         hydro_fields->pi_b_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                         hydro_fields->pi_b_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA])
    {
        sleep(100000);
    }
    cout << "post data copy" << endl;
    return(0);
}  /* main */

Hi chun.shen,

Interesting code! Definitely a pathological case. The core problem is with the layout of your 2D arrays.

When you create 2D arrays with pointers to pointers, the compiler must create the same data structure on the device. Hence, it will first create the array of pointers and then in a loop, create the second dimension on the device and then “attach” the pointer to the first dimension. “attach” basically calls a kernel on the device to copy the pointer values to the first dimension.

Also, data transfer to the device can only be done on contiguous blocks of memory. Hence to copy the data for a 2D array, it must loop over the first dimension copying each individual row to the GPU.

In your case, you have 7 2D arrays of sizes n_cellsx4, n_cellsx14, and n_cellsx20. Given n_cells is 2,585,664, this means you have over 18 million of these “attach” calls and data transfers! This will have severe impact on your performance.

Also, the compiler runtime needs to keep track of all those pointer arrays (as an entry into the present table and other internal data structures). Usually it’s not an issue, but with 18+ million entries, that will add a lot of the extra memory usage.

I HIGHLY suggest you move n_cells to the second dimension. This will take care of both the performance problem as well as the extra memory usage.

For example:

 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <iostream>
 #include <unistd.h>

 #define GRID_SIZE_X 200
 #define GRID_SIZE_Y 200
 #define GRID_SIZE_ETA 64

 struct Field {
      double *e_rk0;
      double *e_rk1;
      double *e_prev;
      double *rhob_rk0;
      double *rhob_rk1;
      double *rhob_prev;
      double **u_rk0;
      double **u_rk1;
      double **u_prev;
      double **dUsup;
      double **Wmunu_rk0;
      double **Wmunu_rk1;
      double **Wmunu_prev;
      double *pi_b_rk0;
      double *pi_b_rk1;
      double *pi_b_prev;
 };

 using namespace std;

 // main program
 int main(int argc, char *argv[]) {
     Field *hydro_fields = new Field;
     int n_cell = GRID_SIZE_ETA*(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1);
     cout << "n_cell=" << n_cell << endl;
     hydro_fields->e_rk0 = new double[n_cell];
     hydro_fields->e_rk1 = new double[n_cell];
     hydro_fields->e_prev = new double[n_cell];
     hydro_fields->rhob_rk0 = new double[n_cell];
     hydro_fields->rhob_rk1 = new double[n_cell];
     hydro_fields->rhob_prev = new double[n_cell];
     hydro_fields->u_rk0 = new double* [4];
     hydro_fields->u_rk1 = new double* [4];
     hydro_fields->u_prev = new double* [4];
     hydro_fields->dUsup = new double* [20];
     hydro_fields->Wmunu_rk0 = new double* [14];
     hydro_fields->Wmunu_rk1 = new double* [14];
     hydro_fields->Wmunu_prev = new double* [14];
     hydro_fields->pi_b_rk0 = new double[n_cell];
     hydro_fields->pi_b_rk1 = new double[n_cell];
     hydro_fields->pi_b_prev = new double[n_cell];
     for (int j = 0; j < 4; j++) {
        hydro_fields->u_rk0[j] = new double[n_cell];
        hydro_fields->u_rk1[j] = new double[n_cell];
        hydro_fields->u_prev[j] = new double[n_cell];
        for (int i = 0; i < n_cell; i++) {
             hydro_fields->u_rk0[j][i] = drand48();
             hydro_fields->u_rk1[j][i] = drand48();
             hydro_fields->u_prev[j][i] = drand48();
        }
     }
     for (int j = 0; j < 20; j++) {
        hydro_fields->dUsup[j] = new double[n_cell];
        for (int i = 0; i < n_cell; i++) {
             hydro_fields->dUsup[j][i] = drand48();
        }
     }
     for (int j = 0; j < 14; j++) {
         hydro_fields->Wmunu_rk0[j] = new double[n_cell];
         hydro_fields->Wmunu_rk1[j] = new double[n_cell];
         hydro_fields->Wmunu_prev[j] = new double[n_cell];
         for (int i = 0; i < n_cell; i++) {
             hydro_fields->Wmunu_rk0[j][i] = drand48();
             hydro_fields->Wmunu_rk1[j][i] = drand48();
             hydro_fields->Wmunu_prev[j][i] = drand48();
         }
     }
     for (int i = 0; i < n_cell; i++) {
         hydro_fields->e_rk0[i] = drand48();
         hydro_fields->e_rk1[i] = drand48();
         hydro_fields->e_prev[i] = drand48();
         hydro_fields->rhob_rk0[i] = drand48();
         hydro_fields->rhob_rk1[i] = drand48();
         hydro_fields->rhob_prev[i] = drand48();
         hydro_fields->pi_b_rk0[i] = drand48();
         hydro_fields->pi_b_rk1[i] = drand48();
         hydro_fields->pi_b_prev[i] = drand48();
     }
     cout << "pre data copy" << endl;
     #pragma acc data create (hydro_fields[0:1], \
                          hydro_fields->e_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->e_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->e_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                          hydro_fields->rhob_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                          hydro_fields->rhob_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA],\
                          hydro_fields->rhob_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->u_rk0[0:4][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->u_rk1[0:4][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->u_prev[0:4][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->dUsup[0:20][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->Wmunu_rk0[0:14][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->Wmunu_rk1[0:14][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->Wmunu_prev[0:14][0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->pi_b_rk0[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->pi_b_rk1[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA], \
                          hydro_fields->pi_b_prev[0:(GRID_SIZE_X + 1)*(GRID_SIZE_Y + 1)*GRID_SIZE_ETA])
     {
     cout << "Before Sleep" << endl;
         sleep(100000);
     }
     cout << "post data copy" << endl;
     return(0);
 }  /* main */

Hi mkcolg,

Thank you very much for your detailed explanation and suggestion! It works!

Chun