Copyin variable-size vectors to device using pragmas

Hello,
is it possible to do something like the code below?

//Class members
   int *cols_in, *cols_out;
	Edge **restrict d_in_attr;
	Edge **restrict d_out_attr;
	node_id **restrict d_in;
	node_id **restrict d_out;

   d_attr = (Node*) malloc(n * sizeof(Node));
	d_in_attr = (Edge**) malloc(sizeof(Edge*) * n);
	d_out_attr = (Edge**) malloc(sizeof(Edge*) * n);
	d_in = (node_id**) malloc(sizeof(node_id*) * n);
	d_out = (node_id**) malloc(sizeof(node_id*) * n);

#pragma acc enter data copyin(this[0:1]) create(d_attr[0:n])
	for (int i = 0; i < n; i++) {
		int in = cols_in[i];
		int out = cols_out[i];

		d_in_attr[i] = (Edge*) malloc(sizeof(Edge) * in);
		#pragma acc enter data create(d_in_attr[i:i+1][0:in])

		d_in[i] = (node_id*) malloc(sizeof(node_id) * in);
		#pragma acc enter data create(d_in[i:i+1][0:in])

		d_out_attr[i] = (Edge*) malloc(sizeof(Edge) * out);
		#pragma acc enter data create(d_out_attr[i:i+1][0:out])

		d_out[i] = (node_id*) malloc(sizeof(node_id) * out);
		#pragma acc enter data create(d_out[i:i+1][0:out])
	}

My goal is to copy these variables into device memory, but I get an error. Can you explain why?

N.B.: the vectors have variable sizes (each row can have a different length).

Hi khrishino,

Without a complete reproducer or a description of the error you’re getting, it’s hard to tell the exact problem. However, looking at the code snippet, the problem may be that you need to create the array of pointers for the multi-dimensional arrays. Also, you’ll want to use “i” instead of “i:i+1” when you create the arrays within the loop.

Try something like:

    #pragma acc enter data copyin(this[0:1]) create(d_attr[0:n],d_in_attr[0:n], d_out_attr[0:n],d_in[0:n], d_out[0:n])
    for (int i = 0; i < n; i++) {
       int in = cols_in[i];
       int out = cols_out[i];

       d_in_attr[i] = (Edge*) malloc(sizeof(Edge) * in);
       #pragma acc enter data create(d_in_attr[i][0:in])

       d_in[i] = (node_id*) malloc(sizeof(node_id) * in);
       #pragma acc enter data create(d_in[i][0:in])

       d_out_attr[i] = (Edge*) malloc(sizeof(Edge) * out);
       #pragma acc enter data create(d_out_attr[i][0:out])

       d_out[i] = (node_id*) malloc(sizeof(node_id) * out);
       #pragma acc enter data create(d_out[i][0:out])
    }

Hope this helps,
Mat

Hello Mat,
sorry!!! This was the previous error:

(null) lives at 0x36df850 size 4 not present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 5.0, threadid=1
host:(nil) device:(nil) size:0 presentcount:0+1 line:788 name:(null)
host:(nil) device:(nil) size:0 presentcount:0+1 line:791 name:(null)
host:0x1773c90 device:0x703940000 size:192 presentcount:0+1 line:777 name:_T23888824_12752
host:0x1774470 device:0x703940200 size:20 presentcount:0+1 line:777 name:(null)
host:0x17744b0 device:0x703940400 size:8 presentcount:0+1 line:777 name:(null)
host:0x17744e0 device:0x703940800 size:8 presentcount:0+1 line:777 name:(null)
host:0x1774510 device:0x703940600 size:8 presentcount:0+1 line:777 name:(null)
host:0x1774540 device:0x703940a00 size:8 presentcount:0+1 line:777 name:(null)
host:0x36d6d70 device:0x703940c00 size:4 presentcount:0+1 line:788 name:(null)
host:0x36d8c20 device:0x703940e00 size:2 presentcount:0+1 line:791 name:(null)
allocated block device:0x703940000 size:512 thread:1
allocated block device:0x703940200 size:512 thread:1
allocated block device:0x703940400 size:512 thread:1
allocated block device:0x703940600 size:512 thread:1
allocated block device:0x703940800 size:512 thread:1
allocated block device:0x703940a00 size:512 thread:1
allocated block device:0x703940c00 size:512 thread:1
allocated block device:0x703940e00 size:512 thread:1
FATAL ERROR: variable in data clause is partially present on the device: name=(unknown)
 file:/home/christian/eclipse-workspace/ProvaParallelVF3/src/../include/argraph.hpp _ZN7ARGraphIiiE12moveToDeviceEv line:794

Now, if I try your solution, I get this error

PGCC-S-0000-Internal compiler error. pragma: bad ilmopc     308 (src/main.cpp: 788)
PGCC-S-0000-Internal compiler error. pragma: bad ilmopc     308 (src/main.cpp: 791)
PGCC-S-0000-Internal compiler error. pragma: bad ilmopc     308 (src/main.cpp: 794)
PGCC-S-0000-Internal compiler error. pragma: bad ilmopc     308 (src/main.cpp: 796)
pgc++-Fatal-/opt/pgi/linux86-64/17.10/bin/pggpp2 TERMINATED by signal 11
Arguments to /opt/pgi/linux86-64/17.10/bin/pggpp2
/opt/pgi/linux86-64/17.10/bin/pggpp2 src/main.cpp -debug -x 120 0x200 -opt 2 -terse 1 -inform severe -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 34 0x8 -x 32 6291456 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 39 0x80 -x 59 4 -x 129 2 -tp haswell -astype 0 -x 121 1 -fn src/main.cpp -il /tmp/pgc++ttKc5OjPVc4j.il/main.il -inlib /tmp/pgc++7tKcXsVryijH.ext -insize 600 -x 221 25000 -x 222 5 -x 14 32 -x 117 0x600 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=50400 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 120 0x200000 -x 70 0x40000000 -x 164 0x800000 -accel tesla -x 180 0x4000400 -x 121 0xc00 -x 163 0x1 -x 186 0x80000 -cudaver 9.0 -x 194 0x40000 -x 176 0x100 -cudacap 30 -cudacap 35 -cudacap 50 -cudacap 60 -cudacap 70 -x 186 0x80000 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 189 0x8000 -y 163 0xc0000000 -x 192 0x40000000 -x 189 0x10 -y 189 0x4000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 9 1 -x 72 0x1 -x 136 0x11 -quad -x 119 0x10000000 -x 129 0x40000000 -x 164 0x1000 -gnuvsn 50400 -x 69 0x200 -x 119 0x08 -cmdline '+pgc++ src/main.cpp -std=c++11 -ta=tesla:cuda9.0,time -acc -I/usr/local/cuda-9.1/include -Minfo=accel -Minline -O2 -Mvect=sse -Mcache_align -Mpre -w -g -o main_nvidia' -asm /tmp/pgc++RtKcbW2hVjJx.s
Makefile:24: set di istruzioni per l'obiettivo "nvidia" non riuscito
make: *** [nvidia] Errore 127

Hi khrishino,

My best guess is that the partially present error is due to the “[i:i+1]” since the second time through the loop, the current “i” was already partially created on the device.

The internal compiler error is of course a compiler error where it is generating a bad intermediate language op code. In looking through our bug reports, I do see a similar issue, but if you could send a reproducing example to PGI Customer Service (trs@pgroup.com), I’ll confirm that it’s the same problem and add your issue to this report.

Let’s try another route:

    for (int i = 0; i < n; i++) { 
       int in = cols_in[i]; 
       int out = cols_out[i]; 

       d_in_attr[i] = (Edge*) malloc(sizeof(Edge) * in); 
       d_in[i] = (node_id*) malloc(sizeof(node_id) * in); 
       d_out_attr[i] = (Edge*) malloc(sizeof(Edge) * out); 
       d_out[i] = (node_id*) malloc(sizeof(node_id) * out);     
   } 

    #pragma acc enter data copyin(this[0:0])  \
       create(d_attr[0:n], d_in_attr[0:n][0:in],   \
        d_out_attr[0:n][0:out],d_in[0:n][0:in], d_out[0:n][0:out])

You may also want to try using CUDA Unified Memory instead (-ta=tesla:managed). Especially with C++ where the data structures can be complex and deeply nested, having the CUDA runtime manage the data rather than doing a manual deep copy in OpenACC is often easier.

The caveat is that only dynamically allocated data is managed; hence, if your class object is static (i.e. declared like “classname A” versus “classname *A”), then the “this” pointer still needs to be manually created on the device. The class members would be fine since they are malloc’d. Just be sure to update “this” if the members’ addresses change.

For example:

    for (int i = 0; i < n; i++) { 
       int in = cols_in[i]; 
       int out = cols_out[i]; 

       d_in_attr[i] = (Edge*) malloc(sizeof(Edge) * in); 
       d_in[i] = (node_id*) malloc(sizeof(node_id) * in); 
       d_out_attr[i] = (Edge*) malloc(sizeof(Edge) * out); 
       d_out[i] = (node_id*) malloc(sizeof(node_id) * out);     
   } 

    #pragma acc enter data copyin(this[0:0])

or something like:

    
#pragma acc enter data copyin(this[0:0])

... later 

for (int i = 0; i < n; i++) { 
       int in = cols_in[i]; 
       int out = cols_out[i]; 

       d_in_attr[i] = (Edge*) malloc(sizeof(Edge) * in); 
       d_in[i] = (node_id*) malloc(sizeof(node_id) * in); 
       d_out_attr[i] = (Edge*) malloc(sizeof(Edge) * out); 
       d_out[i] = (node_id*) malloc(sizeof(node_id) * out);     
   } 
    
#pragma acc update device(this)

-Mat