OK Matt - thanks for the tip. The example I am looking at does way more compute so it makes sense to spatially partition across GPU’s - the example I posted is a overly simplified version I created to try and understand what is causing the run time error. I was going to look into MPI+OpenACC for multi-GPU programming next but I thought doing it as below would be straightforward :-)
I am keen to know what is causing the issue as I have spent a bit of time on it. I have used the PGI_ACC_DEBUG=1. It spews out a lot of Debug info – can you detect an obvious issue from the below?
proc()
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1)
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1) cuda devid=1 dindex=1
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1)
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1) cuda devid=1 dindex=1
pgi_uacc_dataenterstart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=80, devid=0 )
pgi_uacc_dataon(hostptr=0x7ffd85695550,stride=1,size=2,eltsize=4,lineno=80,name=lm1,flags=0x700=present+create+copyin,async=-1,threadid=1)
pgi_uacc_alloc(size=8,devid=1,threadid=1)
allocate device memory 0x7ffa26400000(512B)
pgi_uacc_alloc(size=8,devid=1,threadid=1) returns 0x7ffa26400000
map dev:0x7ffa26400000 host:0x7ffd85695550 dindex:1 size:8 offset:0 (line:80 name:lm1) thread:1
alloc done with devptr at 0x7ffa26400000
pgi_uacc_dataupx(devptr=0x7ffa26400000,hostptr=0x7ffd85695550,stride=1,size=2,eltsize=4,lineno=80,name=lm1,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa26400000,hostsrc=0x7ffd85695550,offset=0,stride=1,size=2,eltsize=4,lineno=80,name=lm1,thread=1)
pgi_uacc_dataon(hostptr=0x7ffd85695558,stride=1,size=2,eltsize=4,lineno=80,name=lm2,flags=0x700=present+create+copyin,async=-1,threadid=1)
pgi_uacc_alloc(size=8,devid=1,threadid=1)
allocate device memory 0x7ffa26400200(512B)
pgi_uacc_alloc(size=8,devid=1,threadid=1) returns 0x7ffa26400200
map dev:0x7ffa26400200 host:0x7ffd85695558 dindex:1 size:8 offset:0 (line:80 name:lm2) thread:1
alloc done with devptr at 0x7ffa26400200
pgi_uacc_dataupx(devptr=0x7ffa26400200,hostptr=0x7ffd85695558,stride=1,size=2,eltsize=4,lineno=80,name=lm2,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa26400200,hostsrc=0x7ffd85695558,offset=0,stride=1,size=2,eltsize=4,lineno=80,name=lm2,thread=1)
pgi_uacc_dataon(hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=80,name=pV1,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1)
allocate device memory 0x7ffa26000000(4194304B)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1) returns 0x7ffa26000000
map dev:0x7ffa26000000 host:0x397d760 dindex:1 size:4194304 offset:0 (line:80 name:pV1) thread:1
alloc done with devptr at 0x7ffa26000000
pgi_uacc_dataon(hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=80,name=pV2,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1)
allocate device memory 0x7ffa25c00000(4194304B)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1) returns 0x7ffa25c00000
map dev:0x7ffa25c00000 host:0x3d7d770 dindex:1 size:4194304 offset:0 (line:80 name:pV2) thread:1
alloc done with devptr at 0x7ffa25c00000
pgi_uacc_dataon(hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=80,name=pV1,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_dataon(hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=80,name=pV2,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_dataenterdone( devid=1 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1)
pgi_uacc_cuda_wait(sync on stream=0x117dd70)
pgi_uacc_cuda_wait done
pgi_uacc_enter( devid=0 )
Thread 1 loading module onto device 0
pgi_uacc_enter(objinfo=0x6098a0)
0x11235813 = magic
1 = numplatforms
Platform 0 = 0x609860
17 = id NVIDIA CUDA 0x00000005 = flags global objptr
3 = numversions
Version 0 = 0x609740
0x33550336 = magic
0x00430104 = flags sm30 sm35 0x03c30104 = pflags sm30 sm35 sm50 sm60 sm70
2 = numfunctions
1 = numbinaries
Function 0 = 0x609580 Function 0 = 0x609580 = proc_82_gpu
82 = lineno
128x1x1 = block size
1x1x1 = grid size
0x0 = config_flag =
1x1x1 = unroll
0 = shared memory
0 = reduction shared memory
0 = reduction arg
0 = reduction bytes
48 = argument bytes
0 = max argument bytes
0 = size arguments
Function 1 = 0x609660 Function 1 = 0x609660 = proc_103_gpu
103 = lineno
128x1x1 = block size
1x1x1 = grid size
0x0 = config_flag =
1x1x1 = unroll
0 = shared memory
0 = reduction shared memory
0 = reduction arg
0 = reduction bytes
40 = argument bytes
0 = max argument bytes
0 = size arguments
Binary 0 = 0x609520
0x00000bbd = binaryid
11424 = binarylen
(nil) = binary
Version 1 = 0x6097a0
0x33550336 = magic
0x00810104 = flags sm50 0x03810104 = pflags sm50 sm60 sm70
2 = numfunctions
1 = numbinaries
Function 0 = 0x609580 Function 0 = 0x609580 = proc_82_gpu
82 = lineno
128x1x1 = block size
1x1x1 = grid size
0x0 = config_flag =
1x1x1 = unroll
0 = shared memory
0 = reduction shared memory
0 = reduction arg
0 = reduction bytes
48 = argument bytes
0 = max argument bytes
0 = size arguments
Function 1 = 0x609660 Function 1 = 0x609660 = proc_103_gpu
103 = lineno
128x1x1 = block size
1x1x1 = grid size
0x0 = config_flag =
1x1x1 = unroll
0 = shared memory
0 = reduction shared memory
0 = reduction arg
0 = reduction bytes
40 = argument bytes
0 = max argument bytes
0 = size arguments
Binary 0 = 0x609540
0x00001388 = binaryid
11424 = binarylen
(nil) = binary
Version 2 = 0x609800
0x33550336 = magic
0x01010104 = flags sm60 0x03010104 = pflags sm60 sm70
2 = numfunctions
1 = numbinaries
Function 0 = 0x609580 Function 0 = 0x609580 = proc_82_gpu
82 = lineno
128x1x1 = block size
1x1x1 = grid size
0x0 = config_flag =
1x1x1 = unroll
0 = shared memory
0 = reduction shared memory
0 = reduction arg
0 = reduction bytes
48 = argument bytes
0 = max argument bytes
0 = size arguments
Function 1 = 0x609660 Function 1 = 0x609660 = proc_103_gpu
103 = lineno
128x1x1 = block size
1x1x1 = grid size
0x0 = config_flag =
1x1x1 = unroll
0 = shared memory
0 = reduction shared memory
0 = reduction arg
0 = reduction bytes
40 = argument bytes
0 = max argument bytes
0 = size arguments
Binary 0 = 0x609560
0x00001770 = binaryid
11424 = binarylen
(nil) = binary
pgi_uacc_computestart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=82, devid=0, computeconstruct=9999 )
pgi_uacc_launch funcnum=0 argptr=0x7ffd85695720 sizeargs=(nil) async=-1 devid=1
Arguments to function 0 proc_82_gpu dindex=1 threadid=1 device=0:
1048576 0 1048576 0 637534208 32762 641728512 32762
633339904 32762 641729024 32762 0 0 0 0
0x00100000 0x00000000 0x00100000 0x00000000 0x26000000 0x00007ffa 0x26400000 0x00007ffa
0x25c00000 0x00007ffa 0x26400200 0x00007ffa 0x00000000 0x00000000 0x00000000 0x00000000
Launch configuration for function=0=proc_82_gpu line=82 dindex=1 threadid=1 device=0 <<<(1,1,1),(128,1,1),0>>> async=-1
pgi_uacc_computedone( devid=0, computeconstruct=9999 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1)
pgi_uacc_cuda_wait(sync on stream=0x117dd70)
pgi_uacc_cuda_wait done
pgi_uacc_dataexitstart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=80, devid=0 )
pgi_uacc_dataoff(devptr=0x7ffa26400000,hostptr=0x7ffd85695550,stride=1,size=2,eltsize=4,lineno=88,name=lm1,flags=0x700=present+create+copyin,async=-1,threadid=1)
pgi_uacc_free(ptr=0x7ffa26400000,devid=1,threadid=1)
pgi_uacc_dataoff(devptr=0x7ffa26400200,hostptr=0x7ffd85695558,stride=1,size=2,eltsize=4,lineno=88,name=lm2,flags=0x700=present+create+copyin,async=-1,threadid=1)
pgi_uacc_free(ptr=0x7ffa26400200,devid=1,threadid=1)
pgi_uacc_dataoff(devptr=0x7ffa26000000,hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=88,name=pV1,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_dataoff(devptr=0x7ffa25c00000,hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=88,name=pV2,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_dataoff(devptr=0x7ffa26000000,hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=88,name=pV1,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_datadownx(devptr=0x7ffa26000000,hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=88,name=pV1,async=-1,threadid=1)
pgi_uacc_cuda_datadown1(devdst=0x7ffa26000000,hostsrc=0x397d760,offset=0,stride=1,size=1048576,eltsize=4,lineno=88,name=pV1,async=-1,dindex=1)
pgi_uacc_free(ptr=0x7ffa26000000,devid=1,threadid=1)
pgi_uacc_dataoff(devptr=0x7ffa25c00000,hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=88,name=pV2,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_datadownx(devptr=0x7ffa25c00000,hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=88,name=pV2,async=-1,threadid=1)
pgi_uacc_cuda_datadown1(devdst=0x7ffa25c00000,hostsrc=0x3d7d770,offset=0,stride=1,size=1048576,eltsize=4,lineno=88,name=pV2,async=-1,dindex=1)
pgi_uacc_free(ptr=0x7ffa25c00000,devid=1,threadid=1)
pgi_uacc_dataexitdone( devid=1 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1)
pgi_uacc_cuda_wait(sync on stream=0x117dd70)
pgi_uacc_cuda_wait done
dt: 0.044673
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1)
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1) cuda devid=1 dindex=1
pgi_uacc_set_device_num(devnum=1,devtype=4,threadid=1)
cuda_init_device thread:1 data.default_device_num:1 pdata.cuda.default_device_num:1
cuda_init_device(threadid=1, device 1) dindex=2, api_context=(nil)
cuda_init_device(threadid=1, device 1) dindex=2, setting api_context=(nil)
cuda_init_device(threadid=1, device 1) dindex=2, new api_context=0xd060c0
argument memory for queue 16 device:0x7ffa0d000000 host:0x7ffa0ce00000
pgi_uacc_set_device_num(devnum=1,devtype=4,threadid=1) cuda devid=2 dindex=2
pgi_uacc_dataenterstart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=92, devid=0 )
pgi_uacc_dataon(hostptr=0x357d750,stride=1,size=1048576,extent=-1,eltsize=4,lineno=92,name=pV0,flags=0x20300=present+create+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=2,threadid=1)
allocate device memory 0x7ffa0ca00000(4194304B)
pgi_uacc_alloc(size=4194304,devid=2,threadid=1) returns 0x7ffa0ca00000
map dev:0x7ffa0ca00000 host:0x357d750 dindex:2 size:4194304 offset:0 (line:92 name:pV0) thread:1
alloc done with devptr at 0x7ffa0ca00000
pgi_uacc_dataon(hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=92,name=pV1,flags=0x20700=present+create+copyin+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=2,threadid=1)
allocate device memory 0x7ffa0c600000(4194304B)
pgi_uacc_alloc(size=4194304,devid=2,threadid=1) returns 0x7ffa0c600000
map dev:0x7ffa0c600000 host:0x397d760 dindex:2 size:4194304 offset:0 (line:92 name:pV1) thread:1
alloc done with devptr at 0x7ffa0c600000
pgi_uacc_dataupx(devptr=0x7ffa0c600000,hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=92,name=pV1,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa0c600000,hostsrc=0x397d760,offset=0,stride=1,size=1048576,eltsize=4,lineno=92,name=pV1,thread=1)
pgi_uacc_dataon(hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=92,name=pV2,flags=0x20700=present+create+copyin+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=2,threadid=1)
allocate device memory 0x7ffa08200000(4194304B)
pgi_uacc_alloc(size=4194304,devid=2,threadid=1) returns 0x7ffa08200000
map dev:0x7ffa08200000 host:0x3d7d770 dindex:2 size:4194304 offset:0 (line:92 name:pV2) thread:1
alloc done with devptr at 0x7ffa08200000
pgi_uacc_dataupx(devptr=0x7ffa08200000,hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=92,name=pV2,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa08200000,hostsrc=0x3d7d770,offset=0,stride=1,size=1048576,eltsize=4,lineno=92,name=pV2,thread=1)
pgi_uacc_dataon(hostptr=0x7ffd856955a0,stride=1,size=2,eltsize=4,lineno=92,name=k,flags=0x20700=present+create+copyin+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=8,devid=2,threadid=1)
allocate device memory 0x7ffa08000000(512B)
pgi_uacc_alloc(size=8,devid=2,threadid=1) returns 0x7ffa08000000
map dev:0x7ffa08000000 host:0x7ffd856955a0 dindex:2 size:8 offset:0 (line:92 name:k) thread:1
alloc done with devptr at 0x7ffa08000000
pgi_uacc_dataupx(devptr=0x7ffa08000000,hostptr=0x7ffd856955a0,stride=1,size=2,eltsize=4,lineno=92,name=k,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa08000000,hostsrc=0x7ffd856955a0,offset=0,stride=1,size=2,eltsize=4,lineno=92,name=k,thread=1)
pgi_uacc_dataenterdone( devid=2 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=2)
pgi_uacc_cuda_wait(sync on stream=0x70f3650)
pgi_uacc_cuda_wait done
dt: 0.044673 0.28501
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1)
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1) cuda devid=1 dindex=1
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1)
pgi_uacc_set_device_num(devnum=0,devtype=4,threadid=1) cuda devid=1 dindex=1
pgi_uacc_dataenterstart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=101, devid=0 )
pgi_uacc_dataon(hostptr=0x417d780,stride=1,size=1048576,extent=-1,eltsize=4,lineno=101,name=pV3,flags=0x20300=present+create+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1)
recycle device memory 0x7ffa26000000(4194304B)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1) returns 0x7ffa26000000
map dev:0x7ffa26000000 host:0x417d780 dindex:1 size:4194304 offset:0 (line:101 name:pV3) thread:1
alloc done with devptr at 0x7ffa26000000
pgi_uacc_dataon(hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=101,name=pV1,flags=0x20700=present+create+copyin+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1)
recycle device memory 0x7ffa25c00000(4194304B)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1) returns 0x7ffa25c00000
map dev:0x7ffa25c00000 host:0x397d760 dindex:1 size:4194304 offset:0 (line:101 name:pV1) thread:1
alloc done with devptr at 0x7ffa25c00000
pgi_uacc_dataupx(devptr=0x7ffa25c00000,hostptr=0x397d760,stride=1,size=1048576,extent=-1,eltsize=4,lineno=101,name=pV1,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa25c00000,hostsrc=0x397d760,offset=0,stride=1,size=1048576,eltsize=4,lineno=101,name=pV1,thread=1)
pgi_uacc_dataon(hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=101,name=pV2,flags=0x20700=present+create+copyin+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1)
allocate device memory 0x7ffa07c00000(4194304B)
pgi_uacc_alloc(size=4194304,devid=1,threadid=1) returns 0x7ffa07c00000
map dev:0x7ffa07c00000 host:0x3d7d770 dindex:1 size:4194304 offset:0 (line:101 name:pV2) thread:1
alloc done with devptr at 0x7ffa07c00000
pgi_uacc_dataupx(devptr=0x7ffa07c00000,hostptr=0x3d7d770,stride=1,size=1048576,extent=-1,eltsize=4,lineno=101,name=pV2,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa07c00000,hostsrc=0x3d7d770,offset=0,stride=1,size=1048576,eltsize=4,lineno=101,name=pV2,thread=1)
pgi_uacc_dataon(hostptr=0x7ffd856955a0,stride=1,size=2,eltsize=4,lineno=101,name=k,flags=0x20700=present+create+copyin+dynamic,async=-1,threadid=1)
pgi_uacc_alloc(size=8,devid=1,threadid=1)
recycle device memory 0x7ffa26400000(512B)
pgi_uacc_alloc(size=8,devid=1,threadid=1) returns 0x7ffa26400000
map dev:0x7ffa26400000 host:0x7ffd856955a0 dindex:1 size:8 offset:0 (line:101 name:k) thread:1
alloc done with devptr at 0x7ffa26400000
pgi_uacc_dataupx(devptr=0x7ffa26400000,hostptr=0x7ffd856955a0,stride=1,size=2,eltsize=4,lineno=101,name=k,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x7ffa26400000,hostsrc=0x7ffd856955a0,offset=0,stride=1,size=2,eltsize=4,lineno=101,name=k,thread=1)
pgi_uacc_dataenterdone( devid=1 )
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1)
pgi_uacc_cuda_wait(sync on stream=0x117dd70)
pgi_uacc_cuda_wait done
pgi_uacc_enter( devid=0 )
pgi_uacc_dataenterstart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=103, devid=0 )
pgi_uacc_dataon(hostptr=0x7ffd856955a0,stride=1,size=2,eltsize=4,lineno=103,name=k,flags=0xf00=present+create+copyin+copyout,async=-4,threadid=1)
pgi_uacc_dataenterdone( devid=1 )
pgi_uacc_computestart( file=/home/xavierportell/mdev.c, function=proc, line=72:137, line=103, devid=0, computeconstruct=9999 )
pgi_uacc_launch funcnum=1 argptr=0x7ffd85695720 sizeargs=(nil) async=-4 devid=1
argument memory for queue 0 device:0x7ffa268fa000 host:0x7ffa266fa000
Arguments to function 1 proc_103_gpu dindex=1 threadid=1 device=0:
1048576 0 641728512 32762 68671360 0 60282720 0
64477040 0 0 0 0 0 0 0
0x00100000 0x00000000 0x26400000 0x00007ffa 0x0417d780 0x00000000 0x0397d760 0x00000000
0x03d7d770 0x00000000 0x00000000 0x00000000 0x00000000 0x00000000 0x00000000 0x00000000
Launch configuration for function=1=proc_103_gpu line=103 dindex=1 threadid=1 device=0 <<<(1,1,1),(128,1,1),0>>> async=-4
Accelerator Kernel Timing data
/home/xavierportell/mdev.c
proc NVIDIA devicenum=1
time(us): 752
92: data region reached 1 time
92: data copyin transfers: 3
device time(us): total=752 max=377 min=11 avg=250
/home/xavierportell/mdev.c
proc NVIDIA devicenum=0
time(us): 1,472
80: data region reached 2 times
80: data copyin transfers: 2
device time(us): total=32 max=22 min=10 avg=16
88: data copyout transfers: 2
device time(us): total=683 max=344 min=339 avg=341
82: compute region reached 1 time
82: kernel launched 1 time
grid: [1] block: [128]
elapsed time(us): total=3,219 max=3,219 min=3,219 avg=3,219
101: data region reached 1 time
101: data copyin transfers: 3
device time(us): total=757 max=377 min=8 avg=252
103: data region reached 1 time
103: compute region reached 1 time
103: kernel launched 1 time
grid: [1] block: [128]
device time(us): total=0 max=0 min=0 avg=0
call to cuMemFreeHost returned error 700: Illegal address during kernel execution