Hi,
I am new to OpenACC, and here is an error that I encountered when I tried to use OpenACC in my code.
// Reproduction snippet: sums (a+b)*log(c + a*temp2[n][f]) over the 32 entries
// of every row n whose flag temp1[n] equals 1, accumulating into reduction_1.
//
// T is the number of flags / rows.
// NOTE(review): the expansion is not parenthesized; that is harmless in the
// uses below, but an expression such as x/T would not bind as intended.
#define T 1024*1024
// Per-row flags, zero-initialized by calloc.
int *temp1=(int *)calloc(T,sizeof(int));
// Table of T row pointers (all NULL after calloc). The rows themselves are
// presumably allocated in the elided initialization; the copyin clause below
// describes each row as 32 doubles. TODO confirm against the full program.
double **restrict temp2 = (double **) calloc(T, sizeof(double *));
// initializing temp1 and temp2 (code not shown in this snippet)
// NOTE(review): the 'f' suffix makes these float literals that are then
// converted to double; the values used here are exactly representable.
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
// Accumulator for the sum over all rows.
double reduction_1 = 0;
// Earlier OpenMP variant of the same loop, kept for reference:
//#pragma omp parallel for reduction(+:reduction_1)
// OpenACC kernels region: the scalars, the T flags and the T x 32 table are
// listed as copy-in data. No reduction clause is given for reduction_1.
#pragma acc kernels copyin(a,b,c,temp1[0:T],temp2[0:T][0:32])
for(int n = 0; n < T; n++){
// Partial sum for row n; stays 0 when the row's flag is not 1.
double tem = 0;
if (temp1[n]== 1){
// Walk the 32 entries of row n.
for(int f= 0;f<32;f++)
tem += (a+b)*log(c+a*temp2[n][f]);
}
// Fold the row's contribution into the overall sum.
reduction_1 += tem;
}
The compilation seems fine:
$pgcc -ta=nvidia,cc20 -Minfo test1.c
main:
39, Generating present_or_copyin(temp2[0:1048576][0:32])
Generating present_or_copyin(temp1[0:1048576])
Generating present_or_copyin(c)
Generating present_or_copyin(b)
Generating present_or_copyin(a)
Generating compute capability 2.0 binary
40, Loop is parallelizable
Accelerator kernel generated
40, #pragma acc loop gang /* blockIdx.x */
CC 2.0 : 30 registers; 32 shared, 104 constant, 0 local memory bytes
43, #pragma acc loop vector(32) /* threadIdx.x */
46, Sum reduction generated for reduction_1
43, Loop is parallelizable
But it fails with the following error (I enabled NVDEBUG):
__pgi_cu_init() found 3 devices
__pgi_cu_init( file=/home/bo/test1.c, function=main, line=39, startline=14, endline=50 )
__pgi_cu_init() will use device 0 (V2.0)
__pgi_cu_init() compute context created
__pgi_cu_module3( lineno=39 )
__pgi_cu_module3 module loaded at 0x12e99040
__pgi_cu_module_function( name=0x41db08=main_40_gpu, lineno=40, argname=(nil)=, argsize=52, varname=0x41db26=b1, varsize=8, SWcachesize=2048 )
Function handle is 0x12ea6e70
__pgi_cu_module_function( name=0x41db15=main_46_gpu_red, lineno=40, argname=(nil)=, argsize=0, varname=(nil)=, varsize=0, SWcachesize=2048 )
Function handle is 0x12ea2bf0
pgi_acc_dataon(devptr=0x401d70,hostptr=0x7f1f3d20f010,offset=0,0,stride=1,-1,size=32x1048576,extent=-1x-1,eltsize=8,lineno=39,name=temp2,flags=0x701=sync+create+present+copyin)
NO map for host:0x7f1f3d20f010
__pgi_cu_alloc(size=276824064,lineno=39,name=temp2)
__pgi_cu_alloc(276824064) returns 0x200300000 (address=0x7fff8878f008)
__pgi_cu_init( file=/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c, function=__pgi_cu_fill, line=26, startline=22, endline=28 )
__pgi_cu_module3( lineno=26 )
__pgi_cu_module3 module loaded at 0x12ea0970
__pgi_cu_module_function( name=0x420539=__pgi_cu_fill_27_gpu, lineno=27, argname=(nil)=, argsize=32, varname=(nil)=, varsize=0, SWcachesize=0 )
Function handle is 0x12ea1250
__pgi_cu_launch_a(func=0x12ea1250, grid=8192x1x1, block=128x1x1, lineno=27)
__pgi_cu_launch_a(func=0x12ea1250, params=0x7fff8878eba0, bytes=32, sharedbytes=0)
First arguments are:
1048576 48 11534336 2 3145728 2 256 0
0x00100000 0x00000030 0x00b00000 0x00000002 0x00300000 0x00000002 0x00000100 0x00000000
launch kernel file=/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c function=__pgi_cu_fill line=27 device=0 grid=8192 block=128 queue=0
__pgi_cu_close()
map dev:0x200300000 host:0x7f1f3d20f010 size:8388608 offset:0 data[dev:0x200b00000 host:0x1d0d010 size:268435456] (line:39 name:temp2) dims=32x1048576
alloc done with devptr at 0x200300000 (address=0x7fff8878f008)
__pgi_acc_dataupx(devptr=0x200b00000,hostptr=0x7f1f3d20f010,offset=0,0,stride=1,-1,size=32x1048576,extent=-1x-1,eltsize=8,lineno=39,name=temp2,flags=0x0)
pgi_acc_dataon(devptr=0x0,hostptr=0x7f1f3da10010,offset=0,stride=1,size=1048576,extent=-1,eltsize=4,lineno=39,name=temp1,flags=0x701=sync+create+present+copyin)
NO map for host:0x7f1f3da10010
__pgi_cu_alloc(size=4194304,lineno=39,name=temp1)
__pgi_cu_alloc(4194304) returns 0x210f00000 (address=0x7fff8878f000)
map dev:0x210f00000 host:0x7f1f3da10010 size:4194304 offset:0 data[dev:0x210f00000 host:0x7f1f3da10010 size:4194304] (line:39 name:temp1)
alloc done with devptr at 0x210f00000 (address=0x7fff8878f000)
__pgi_acc_dataupx(devptr=0x210f00000,hostptr=0x7f1f3da10010,offset=0,stride=1,size=1048576,extent=-1,eltsize=4,lineno=39,name=temp1,flags=0x0)
pgi_acc_dataon(devptr=0x41d9e0,hostptr=0x7fff8878eeb8,eltsize=8,lineno=39,name=c,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eeb8
__pgi_cu_alloc(size=8,lineno=39,name=c)
__pgi_cu_alloc(8) returns 0x211300000 (address=0x7fff8878eff8)
map dev:0x211300000 host:0x7fff8878eeb8 size:8 offset:0 data[dev:0x211300000 host:0x7fff8878eeb8 size:8] (line:39 name:c)
alloc done with devptr at 0x211300000 (address=0x7fff8878eff8)
__pgi_acc_dataupx(devptr=0x211300000,hostptr=0x7fff8878eeb8,eltsize=8,lineno=39,name=c,flags=0x0)
pgi_acc_dataon(devptr=0x30fcd91730,hostptr=0x7fff8878eec0,eltsize=8,lineno=39,name=b,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eec0
__pgi_cu_alloc(size=8,lineno=39,name=b)
__pgi_cu_alloc(8) returns 0x211300200 (address=0x7fff8878eff0)
map dev:0x211300200 host:0x7fff8878eec0 size:8 offset:0 data[dev:0x211300200 host:0x7fff8878eec0 size:8] (line:39 name:b)
alloc done with devptr at 0x211300200 (address=0x7fff8878eff0)
__pgi_acc_dataupx(devptr=0x211300200,hostptr=0x7fff8878eec0,eltsize=8,lineno=39,name=b,flags=0x0)
pgi_acc_dataon(devptr=0x41da25,hostptr=0x7fff8878eec8,eltsize=8,lineno=39,name=a,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eec8
__pgi_cu_alloc(size=8,lineno=39,name=a)
__pgi_cu_alloc(8) returns 0x211300400 (address=0x7fff8878efe8)
map dev:0x211300400 host:0x7fff8878eec8 size:8 offset:0 data[dev:0x211300400 host:0x7fff8878eec8 size:8] (line:39 name:a)
alloc done with devptr at 0x211300400 (address=0x7fff8878efe8)
__pgi_acc_dataupx(devptr=0x211300400,hostptr=0x7fff8878eec8,eltsize=8,lineno=39,name=a,flags=0x0)
__pgi_cu_alloc(size=524280,lineno=40,name=)
__pgi_cu_alloc(524280) returns 0x211400000
__pgi_cu_uploadc( "b1", size=8, offset=0, lineno=40 )
call to cuModuleGetGlobal returned error 500: Not found
CUDA driver version: 4010
Please give me a hint about what I should do or what the problem could be.
Thank you in advance.