cuModuleGetGlobal error

Hi,

I am new to OpenACC and here is an error that I encountered when I tried to use OpenACC on my code.

#define T 1024*1024

int *temp1=(int *)calloc(T,sizeof(int));
double **restrict temp2 = (double **) calloc(T, sizeof(double *));

// initializing temp1 and temp2
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)

#pragma acc kernels  copyin(a,b,c,temp1[0:T],temp2[0:T][0:32])
for(int n = 0; n < T; n++){
            double tem = 0;
        if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n][f]);
        }
    reduction_1 += tem;
}

The compile seems fine:

$pgcc -ta=nvidia,cc20 -Minfo test1.c
main:
39, Generating present_or_copyin(temp2[0:1048576][0:32])
Generating present_or_copyin(temp1[0:1048576])
Generating present_or_copyin(c)
Generating present_or_copyin(b)
Generating present_or_copyin(a)
Generating compute capability 2.0 binary
40, Loop is parallelizable
Accelerator kernel generated
40, #pragma acc loop gang /* blockIdx.x */
CC 2.0 : 30 registers; 32 shared, 104 constant, 0 local memory bytes
43, #pragma acc loop vector(32) /* threadIdx.x */
46, Sum reduction generated for reduction_1
43, Loop is parallelizable

But it fails with following error: ( I enabled NVDEBUG )

__pgi_cu_init() found 3 devices
__pgi_cu_init( file=/home/bo/test1.c, function=main, line=39, startline=14, endline=50 )
__pgi_cu_init() will use device 0 (V2.0)
__pgi_cu_init() compute context created
__pgi_cu_module3( lineno=39 )
__pgi_cu_module3 module loaded at 0x12e99040
__pgi_cu_module_function( name=0x41db08=main_40_gpu, lineno=40, argname=(nil)=, argsize=52, varname=0x41db26=b1, varsize=8, SWcachesize=2048 )
Function handle is 0x12ea6e70
__pgi_cu_module_function( name=0x41db15=main_46_gpu_red, lineno=40, argname=(nil)=, argsize=0, varname=(nil)=, varsize=0, SWcachesize=2048 )
Function handle is 0x12ea2bf0
pgi_acc_dataon(devptr=0x401d70,hostptr=0x7f1f3d20f010,offset=0,0,stride=1,-1,size=32x1048576,extent=-1x-1,eltsize=8,lineno=39,name=temp2,flags=0x701=sync+create+present+copyin)
NO map for host:0x7f1f3d20f010
__pgi_cu_alloc(size=276824064,lineno=39,name=temp2)
__pgi_cu_alloc(276824064) returns 0x200300000 (address=0x7fff8878f008)
__pgi_cu_init( file=/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/…/src-nv/nvfill.c, function=__pgi_cu_fill, line=26, startline=22, endline=28 )
__pgi_cu_module3( lineno=26 )
__pgi_cu_module3 module loaded at 0x12ea0970
__pgi_cu_module_function( name=0x420539=__pgi_cu_fill_27_gpu, lineno=27, argname=(nil)=, argsize=32, varname=(nil)=, varsize=0, SWcachesize=0 )
Function handle is 0x12ea1250
__pgi_cu_launch_a(func=0x12ea1250, grid=8192x1x1, block=128x1x1, lineno=27)
__pgi_cu_launch_a(func=0x12ea1250, params=0x7fff8878eba0, bytes=32, sharedbytes=0)
First arguments are:
1048576 48 11534336 2 3145728 2 256 0

0x00100000 0x00000030 0x00b00000 0x00000002 0x00300000 0x00000002 0x00000100 0x00000000
launch kernel file=/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/…/src-nv/nvfill.c function=__pgi_cu_fill line=27 device=0 grid=8192 block=128 queue=0
__pgi_cu_close()
map dev:0x200300000 host:0x7f1f3d20f010 size:8388608 offset:0 data[dev:0x200b00000 host:0x1d0d010 size:268435456] (line:39 name:temp2) dims=32x1048576
alloc done with devptr at 0x200300000 (address=0x7fff8878f008)
__pgi_acc_dataupx(devptr=0x200b00000,hostptr=0x7f1f3d20f010,offset=0,0,stride=1,-1,size=32x1048576,extent=-1x-1,eltsize=8,lineno=39,name=temp2,flags=0x0)
pgi_acc_dataon(devptr=0x0,hostptr=0x7f1f3da10010,offset=0,stride=1,size=1048576,extent=-1,eltsize=4,lineno=39,name=temp1,flags=0x701=sync+create+present+copyin)
NO map for host:0x7f1f3da10010
__pgi_cu_alloc(size=4194304,lineno=39,name=temp1)
__pgi_cu_alloc(4194304) returns 0x210f00000 (address=0x7fff8878f000)
map dev:0x210f00000 host:0x7f1f3da10010 size:4194304 offset:0 data[dev:0x210f00000 host:0x7f1f3da10010 size:4194304] (line:39 name:temp1)
alloc done with devptr at 0x210f00000 (address=0x7fff8878f000)
__pgi_acc_dataupx(devptr=0x210f00000,hostptr=0x7f1f3da10010,offset=0,stride=1,size=1048576,extent=-1,eltsize=4,lineno=39,name=temp1,flags=0x0)
pgi_acc_dataon(devptr=0x41d9e0,hostptr=0x7fff8878eeb8,eltsize=8,lineno=39,name=c,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eeb8
__pgi_cu_alloc(size=8,lineno=39,name=c)
__pgi_cu_alloc(8) returns 0x211300000 (address=0x7fff8878eff8)
map dev:0x211300000 host:0x7fff8878eeb8 size:8 offset:0 data[dev:0x211300000 host:0x7fff8878eeb8 size:8] (line:39 name:c)
alloc done with devptr at 0x211300000 (address=0x7fff8878eff8)
__pgi_acc_dataupx(devptr=0x211300000,hostptr=0x7fff8878eeb8,eltsize=8,lineno=39,name=c,flags=0x0)
pgi_acc_dataon(devptr=0x30fcd91730,hostptr=0x7fff8878eec0,eltsize=8,lineno=39,name=b,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eec0
__pgi_cu_alloc(size=8,lineno=39,name=b)
__pgi_cu_alloc(8) returns 0x211300200 (address=0x7fff8878eff0)
map dev:0x211300200 host:0x7fff8878eec0 size:8 offset:0 data[dev:0x211300200 host:0x7fff8878eec0 size:8] (line:39 name:b)
alloc done with devptr at 0x211300200 (address=0x7fff8878eff0)
__pgi_acc_dataupx(devptr=0x211300200,hostptr=0x7fff8878eec0,eltsize=8,lineno=39,name=b,flags=0x0)
pgi_acc_dataon(devptr=0x41da25,hostptr=0x7fff8878eec8,eltsize=8,lineno=39,name=a,flags=0x701=sync+create+present+copyin)
NO map for host:0x7fff8878eec8
__pgi_cu_alloc(size=8,lineno=39,name=a)
__pgi_cu_alloc(8) returns 0x211300400 (address=0x7fff8878efe8)
map dev:0x211300400 host:0x7fff8878eec8 size:8 offset:0 data[dev:0x211300400 host:0x7fff8878eec8 size:8] (line:39 name:a)
alloc done with devptr at 0x211300400 (address=0x7fff8878efe8)
__pgi_acc_dataupx(devptr=0x211300400,hostptr=0x7fff8878eec8,eltsize=8,lineno=39,name=a,flags=0x0)
__pgi_cu_alloc(size=524280,lineno=40,name=)
__pgi_cu_alloc(524280) returns 0x211400000
__pgi_cu_uploadc( "b1", size=8, offset=0, lineno=40 )
call to cuModuleGetGlobal returned error 500: Not found
CUDA driver version: 4010



Please give me a hint what should I do or what the problem could be.

Thank you in advance.

Hi bofang,

Can you double check that your code works in host code (i.e. don’t compile with -acc)? I don’t see where you allocate temp2’s second dimension.

You can also try using the C99 VLA syntax to declare temp2.

double * restrict temp2[32] = (double *) calloc(T*32, sizeof(double));
  • Mat

Sorry that I didn’t paste that part of the code here. This piece of code works on the host.

 for (int i = 0; i < T; i++){

        temp2[i] = (double *) calloc(32, sizeof(double));
    }

Ok, then I don’t see anything obvious. Please post a complete reproducing example or send one to PGI Customer Service (trs@pgroup.com) and ask them to forward it to me.

Note while it shouldn’t cause this error, you don’t need to copyin your scalar variables. These will be passed as argument to your kernels so don’t need to be explicitly copied.

  • Mat

OK. Here is my complete code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T 1024*1024


/* Return a pseudo-random double uniformly distributed in [fMin, fMax]. */
double fRand(double fMin, double fMax)
{
    double span = fMax - fMin;
    double unit = (double)rand() / RAND_MAX;  /* scale rand() into [0, 1] */
    return fMin + unit * span;
}

int main(void){

/* Reproducer: a kernels region reading a pointer-to-pointer (double **)
   array fails at runtime with "cuModuleGetGlobal returned error 500"
   (filed as TPR#19044 later in this thread). */
int *temp1=(int *)calloc(T,sizeof(int));
    double **restrict temp2 = (double **) calloc(T, sizeof(double *));
    /* Allocate each row separately: 1048576 small, non-contiguous blocks. */
    for (int i = 0; i < T; i++){

        temp2[i] = (double *) calloc(32, sizeof(double));
    }

srand(time(NULL));
/* Randomly mark roughly half of the entries active (temp1[i] == 1). */
for(int i = 0; i < T;i++)
{    temp1[i] = rand()%2;
    // printf("Temp1  is %d\n",temp1[i]);
}
/* Fill each 32-element row with uniform random values in [1, 10]. */
for(int i = 0; i < T; i++)
{
    for(int j = 0; j < 32; j++)
    temp2[i][j] = fRand(1,10);
}
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)

/* Explicit copyin clause commented out; the compiler generates implicit
   present_or_copyin clauses for temp1/temp2 (see -Minfo output above). */
#pragma acc kernels  /*copyin(a,b,c,temp1[0:1024*1024],temp2[0:T][0:32])*/
for(int n = 0; n < T; n++){
            double tem = 0;
        if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n][f]);
        }
    reduction_1 += tem;
}
printf("Result is %f\n",reduction_1);
 return 1;
}

The command I am using to compile is

pgcc -ta=nvidia,cc20 test1.c -Minfo


Thank you Mat

Hi bofang,

Odd. For some reason the runtime can’t find the device copy of temp2. I have filed a problem report (TPR#19044) and sent to our compiler engineers. The work around is to add a data region:

% cat test.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T 1024*1024


double fRand(double fMin, double fMax)
{
    /* Uniform pseudo-random double in [fMin, fMax]. */
    double f = (double)rand() / RAND_MAX;
    return fMin + f * (fMax - fMin);
}

int main(void){

/* Workaround version: wrapping the kernels region in an explicit
   `acc data copyin(...)` region avoids the cuModuleGetGlobal failure. */
int *temp1=(int *)calloc(T,sizeof(int));
    double **restrict temp2 = (double **) calloc(T, sizeof(double *));
    /* Row-by-row allocation: T non-contiguous 32-element blocks. */
    for (int i = 0; i < T; i++){

        temp2[i] = (double *) calloc(32, sizeof(double));
    }

srand(time(NULL));
/* Randomly flag roughly half of the entries. */
for(int i = 0; i < T;i++)
{    temp1[i] = rand()%2;
    // printf("Temp1  is %d\n",temp1[i]);
}
/* Fill rows with uniform random values in [1, 10]. */
for(int i = 0; i < T; i++)
{
    for(int j = 0; j < 32; j++)
    temp2[i][j] = fRand(1,10);
}
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)
/* The explicit data region is the workaround for TPR#19044. */
#pragma acc data copyin(temp1[0:T],temp2[0:T][0:32])
 {
#pragma acc kernels 
for(int n = 0; n < T; n++){
            double tem = 0;
        if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n][f]);
        }
    reduction_1 += tem;
}
 }
printf("Result is %f\n",reduction_1);
 return 1;
} 
% pgcc -Minfo=accel test.c -V12.9 -acc
main:
     38, Generating copyin(temp2[0:1048576][0:32])
         Generating copyin(temp1[0:1048576])
     40, Generating present_or_copyin(temp2[0:1048576][0:32])
         Generating present_or_copyin(temp1[0:1048576])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     41, Loop is parallelizable
         Accelerator kernel generated
         41, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 31 registers; 96 shared, 52 constant, 0 local memory bytes
             CC 2.0 : 28 registers; 32 shared, 104 constant, 0 local memory bytes
         44, #pragma acc loop vector(32) /* threadIdx.x */
         47, Sum reduction generated for reduction_1
     44, Loop is parallelizable
% a.out
Result is 105127285.494059

Accelerator Kernel Timing data
./test.c
  main
    40: region entered 1 time
        time(us): total=32,520
                  kernels=25,658
        41: kernel launched 1 times
            grid: [65535]  block: [32]
            time(us): total=25,552 max=25,552 min=25,552 avg=25,552
        47: kernel launched 1 times
            grid: [1]  block: [256]
            time(us): total=106 max=106 min=106 avg=106
/proj/pgrel/extract/x86/2012/rte/accel/hammer/lib-linux86-64/../src-nv/nvfill.c
  __pgi_cu_fill
    26: region entered 1 time
        time(us): total=32,227
                  kernels=83
        27: kernel launched 1 times
            grid: [8192]  block: [128]
            time(us): total=83 max=83 min=83 avg=83
./test.c
  main
    38: region entered 1 time
        time(us): total=5,909,665 init=67,314 region=5,842,351
                  data=5,773,474
        w/o init: total=5,842,351 max=5,842,351 min=5,842,351 avg=5,842,351

One thing to note is that your performance will be very poor since it takes almost 6 seconds to copy your data to the GPU (at least on my system). The problem being that data needs to be copied in contiguous blocks so your temp2 can’t be copied in one large chunk. Instead, you have 1048576 copies of 32 elements each. To fix, you should move to using a 1-D array. For example:

% cat test2.c 
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T 1024*1024
#define TT 32

#ifdef _OPENACC
#include <accelmath.h>
#endif

double fRand(double fMin, double fMax)
{
    /* Uniform pseudo-random double in [fMin, fMax]. */
    double f = (double)rand() / RAND_MAX;
    return fMin + f * (fMax - fMin);
}

int main(void){

int *temp1=(int *)calloc(T,sizeof(int));
/* 1-D flattened layout (T*TT contiguous doubles) replaces the original
   double** so the whole array transfers to the GPU in one copy instead
   of T separate 32-element copies. */
double *restrict temp2 = (double *) calloc(T*TT, sizeof(double));
/* 
 for (int i = 0; i < T; i++){
   temp2[i] = (double *) calloc(32, sizeof(double));
 }
*/

srand(time(NULL));
/* Randomly flag roughly half of the entries. */
for(int i = 0; i < T;i++)
{    temp1[i] = rand()%2;
    // printf("Temp1  is %d\n",temp1[i]);
}
/* Fill with uniform random values; row i starts at index i*TT. */
for(int i = 0; i < T; i++)
{
    for(int j = 0; j < TT; j++)
    temp2[i*TT+j] = fRand(1,10);
}
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
double reduction_1 = 0;
//#pragma omp parallel for reduction(+:reduction_1)

#pragma acc data copyin(a,b,c,temp1[0:T],temp2[0:T*TT])
 {
   reduction_1 = 0.0;
#pragma acc kernels loop 
   for(int n = 0; n < T; n++){
     double tem = 0;
     if (temp1[n]== 1){
       for(int f= 0;f<TT;f++)
	 tem += (a+b)*log(c+a*temp2[n*TT+f]);
     }
    reduction_1 += tem;
   }
 }
printf("Result is %f\n",reduction_1);
 return 1;
} 

% pgcc -Minfo=accel test2.c -V12.9 -acc
main:
     44, Generating copyin(temp2[0:33554432])
         Generating copyin(temp1[0:1048576])
         Generating copyin(c)
         Generating copyin(b)
         Generating copyin(a)
     47, Generating present_or_copyin(temp2[0:33554432])
         Generating present_or_copyin(temp1[0:1048576])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     48, Loop is parallelizable
         Accelerator kernel generated
         48, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 33 registers; 96 shared, 52 constant, 0 local memory bytes
             CC 2.0 : 30 registers; 32 shared, 104 constant, 0 local memory bytes
         51, #pragma acc loop vector(32) /* threadIdx.x */
         54, Sum reduction generated for reduction_1
     51, Loop is parallelizable

% a.out
Result is 105064474.847449

Accelerator Kernel Timing data
./test2.c
  main
    47: region entered 1 time
        time(us): total=119,004
                  kernels=25,406
        48: kernel launched 1 times
            grid: [65535]  block: [32]
            time(us): total=25,300 max=25,300 min=25,300 avg=25,300
        54: kernel launched 1 times
            grid: [1]  block: [256]
            time(us): total=106 max=106 min=106 avg=106
./test2.c
  main
    44: region entered 1 time
        time(us): total=280,812 init=98,856 region=181,956
                  data=58,649
        w/o init: total=181,956 max=181,956 min=181,956 avg=181,956

With this change, the data transfer time goes from 5,773,474 us down to 58,649 us.

  • Mat

Hi Mat,

Those are very good suggestions! Thank you.

One interesting thing is that after I applied all tricks you mentioned in the code, here is my time profile:

Accelerator Kernel Timing data
/home/bo/test1.c
  main
    42: region entered 1 time
        time(us): total=28,935 init= region=28,934
                  kernels=27,537
        w/o init: total=28,934 max=28,934 min=28,934 avg=28,934
        43: kernel launched 1 times
            grid: [65535]  block: [32]
            time(us): total=27,402 max=27,402 min=27,402 avg=27,402
        49: kernel launched 1 times
            grid: [1]  block: [256]
            time(us): total=135 max=135 min=135 avg=135
/home/bo/test1.c
  main
    39: region entered 1 time
        time(us): total=6,079,576 init=5,953,711 region=125,865
                  data=90,984
        w/o init: total=125,865 max=125,865 min=125,865 avg=125,865

As you can see the data transfer time is 90,984. Which is ok. But init time is 5,953,711, which is quite large comparing to your 98,856. Any ideas?

Thanks a lot!

Bo

Hi Bo,

Linux will power down the GPU when not in use and it takes about 1 second per device to start it back-up. You can run the PGI utility pgcudainit as background task to hold the devices open so you minimize the initialization cost.

Hope this helps,
Mat

Oh thats good to know.

Thank you

Bo

Hi Mat,

I found an interesting thing here:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#define T 4756

/* Record the current wall-clock time, in milliseconds, into *stopwatch. */
void stopwatch_start(double* stopwatch) {
  struct timeval now;
  gettimeofday(&now, NULL);
  double ms = now.tv_sec * 1000 + now.tv_usec / 1000.0;
  *stopwatch = ms;
}

/**                                                                                                                                            
 *  *  * Returns the elapsed time since the stopwatch started via stopwatch_start                                                                 
 *   *   * @param[in] stopwatch the stopwatch handler                                                                                              
 *    *    * @return elapsed time in milliseconds                                                                                                   
 *     *     */
/* Return milliseconds elapsed since *stopwatch was set by stopwatch_start. */
double stopwatch_elapsed(double* stopwatch) {
  struct timeval now;
  gettimeofday(&now, NULL);
  double ms = now.tv_sec * 1000 + now.tv_usec / 1000.0;
  return ms - *stopwatch;
}


double fRand(double fMin, double fMax)
{
    /* Uniform pseudo-random double in [fMin, fMax]. (Unused in this
       version of the program; data is filled with constants below.) */
    double f = (double)rand() / RAND_MAX;
    return fMin + f * (fMax - fMin);
}

int main(void){
unsigned int len1 = 3182; 
int *temp1=(int *)calloc(T,sizeof(int));
    /* NOTE(review): element type is double but the size passed is
       sizeof(double *) — same 8 bytes on LP64, but presumably this was
       meant to be sizeof(double); confirm on other targets. */
    double * temp2 = (double *) calloc(T*32, sizeof(double *));
  /*  for (int i = 0; i < T; i++){
  
        temp2[i] = (double *) calloc(32, sizeof(double));
    }
*/
double a = 1.0f;
double b = 2.0f;
double c = 3.0f;
double reduction_1 = 0;

double timer;
stopwatch_start(&timer);
// high level loop
/* The data region sits INSIDE this loop, so temp1/temp2 are re-copied
   to the device on every one of the len1 iterations — this is the
   transfer overhead discussed below. */
for(int r_est = 0; r_est < len1; r_est ++){


/* Mark even indices 0, odd indices 1. */
for(int i = 0; i < T;i++)
{    
     if(i%2 == 0)
     temp1[i] = 0;
     else 
     temp1[i] = 1;
    // printf("Temp1  is %d\n",temp1[i]);
}
/* Constant fill of the flattened T x 32 array. */
for(int i = 0; i < T; i++)
{
    for(int j = 0; j < 32; j++)
    temp2[i*32+j] = 1.5f;
}

#pragma acc data  copyin(temp1[0:T],temp2[0:T*32])
{// printf("enter %d times\n",r_est);
    /* First kernel: accumulate over rows flagged 1. */
    #pragma acc kernels
    for(int n = 0; n < T; n++){
            double tem = 0;
        if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n*32+f]);
        }
        reduction_1 += tem;
    }
    /* Second kernel: accumulate over rows flagged 0. */
    #pragma acc kernels
    for(int n = 0; n < T; n++){
            double tem = 0;
        if (temp1[n]== 0){
            for(int f= 0;f<32;f++)
                tem += (a+b)*log(c+a*temp2[n*32+f]);
        }
        reduction_1 += tem;
    }
    
    
}

}
printf("Result is %f,  time is %f\n",reduction_1,stopwatch_elapsed(&timer));
 return 1;
}

If I compile the above code in g++, it needs 22 s to run on my machine. If I compile it with pgcc, it takes 8s to finish. If I compile it with pgcc+openacc, it takes 7.5s to finish. I am wondering how pgcc optimize the code so that it can have a very good performance? I attached my openacc timing below. 50% of time has been spent in initialization, but I did keep the pgcudainit run in the background. Is there anything I can do to minimize that time? Thank you

Please note that I comment out the openacc pragma when I compile it with g++ and pgcc.

Accelerator Kernel Timing data
/home/bo/test1.c
  main
    78: region entered 3182 times
        time(us): total=630,282 init=216 region=630,066
                  kernels=493,211
        w/o init: total=630,066 max=329 min=196 avg=198
        79: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=453,832 max=149 min=142 avg=142
        85: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=39,379 max=19 min=12 avg=12
/home/bo/test1.c
  main
    69: region entered 3182 times
        time(us): total=656,918 init=479 region=656,439
                  kernels=509,233
        w/o init: total=656,439 max=1,105 min=204 avg=206
        70: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=458,228 max=157 min=143 avg=144
        76: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=51,005 max=30 min=15 avg=16
/home/bo/test1.c
  main
    67: region entered 3182 times
        time(us): total=7,424,109 init=3,092,275 region=4,331,834
                  data=2,019,706
        w/o init: total=4,331,834 max=4,352 min=1,313 avg=1,361

[/code]

I am wondering how pgcc optimize the code so that it can have a very good performance?

While my gcc time isn’t a bad as yours, yes PGI is much faster, even without optimization. With “-Minfo” you can see some of the optimization performed. Also, I believe our log function is faster.

% gcc interesting.c -O0 -std=c99 -lm ; a.out
Result is 2185160991.755704,  time is 9168.019775
% gcc interesting.c -O3 -ftree-vectorize -ffast-math -std=c99 -lm ; a.out
Result is 2185160991.755704,  time is 6343.856934
% pgcc interesting.c -Minfo -O0 -Msafeptr ; a.out
Result is 2185160991.755704,  time is 7534.363037
% pgcc interesting.c -Minfo -fast -Msafeptr ; a.out
main:
     55, Loop not vectorized: may not be beneficial
         Unrolled inner loop 8 times
         Residual loop unrolled 4 times (completely unrolled)
     67, Memory set idiom, loop replaced by call to __c_mset8
     75, Generated vector sse code for the loop
         Generated a prefetch instruction for the loop
     83, Generated vector sse code for the loop
         Generated a prefetch instruction for the loop
Result is 2185160991.755704,  time is 4462.258057



Is there anything I can do to minimize that time?

Hoist the data region above the outer loop and put the data init routines on the accelerator:

% cat interesting.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#define T 4756

void stopwatch_start(double* stopwatch) {
  /* Record current wall-clock time in milliseconds into *stopwatch. */
  struct timeval tval;
  gettimeofday(&tval, NULL);
  *stopwatch = (tval.tv_sec * 1000 + tval.tv_usec/1000.0);
}

/**                                                                                                                                           
 *  *  * Returns the elapsed time since the stopwatch started via stopwatch_start                                            
 *   *   * @param[in] stopwatch the stopwatch handler                                         
 *    *    * @return elapsed time in milliseconds                                     
 *     *     */
double stopwatch_elapsed(double* stopwatch) {
  /* Milliseconds elapsed since *stopwatch was set by stopwatch_start. */
  struct timeval tval;
  gettimeofday(&tval, NULL);
  return ((tval.tv_sec * 1000 + tval.tv_usec/1000.0) - *stopwatch);
}


double fRand(double fMin, double fMax)
{
  /* Uniform pseudo-random double in [fMin, fMax]. (Unused here.) */
  double f = (double)rand() / RAND_MAX;
  return fMin + f * (fMax - fMin);
}

int main(void){
  unsigned int len1 = 3182;
  int *temp1=(int *)calloc(T,sizeof(int));
  /* NOTE(review): sizeof(double *) where sizeof(double) was presumably
     intended — same 8 bytes on LP64, but misleading; confirm. */
  double * temp2 = (double *) calloc(T*32, sizeof(double *));
  /*  for (int i = 0; i < T; i++){
 
      temp2[i] = (double *) calloc(32, sizeof(double));
      }
  */
  double a = 1.0f;
  double b = 2.0f;
  double c = 3.0f;
  double reduction_1 = 0;

  double timer;
  stopwatch_start(&timer);
  // high level loop
  /* Data region hoisted ABOVE the outer loop: device arrays are created
     once (create, no host copy) and the fill loops run on the device,
     eliminating the per-iteration host-to-device transfers. */
#pragma acc data create(temp1[0:T],temp2[0:T*32])
  {
    for(int r_est = 0; r_est < len1; r_est ++){

#pragma acc kernels 
      {
	/* Device-side init: even indices 0, odd indices 1. */
	for(int i = 0; i < T;i++)
	  {   
	    if(i%2 == 0)
	      temp1[i] = 0;
	    else
	      temp1[i] = 1;
	    // printf("Temp1  is %d\n",temp1[i]);
	  }
	/* Device-side constant fill of the flattened T x 32 array. */
#pragma acc loop independent
	for(int i = 0; i < T; i++)
	  {
#pragma acc loop independent
	    for(int j = 0; j < 32; j++)
	      temp2[i*32+j] = 1.5f;
	  }

	// printf("enter %d times\n",r_est);
	/* Accumulate over rows flagged 1. */
	for(int n = 0; n < T; n++){
	  double tem = 0;
	  if (temp1[n]== 1){
            for(int f= 0;f<32;f++)
	      tem += (a+b)*log(c+a*temp2[n*32+f]);
	  }
	  reduction_1 += tem;
	}
	/* Accumulate over rows flagged 0. */
	for(int n = 0; n < T; n++){
	  double tem = 0;
	  if (temp1[n]== 0){
            for(int f= 0;f<32;f++)
	      tem += (a+b)*log(c+a*temp2[n*32+f]);
	  }
	  reduction_1 += tem;
	}
	
      }
    }
  }
  printf("Result is %f,  time is %f\n",reduction_1,stopwatch_elapsed(&timer));
  return 1;
} 
% pgcc interesting.c -Minfo=accel -fast -acc -ta=nvidia,4.2 -Msafeptr ; a.out
main:
     49, Generating create(temp2[0:152192])
         Generating create(temp1[0:4756])
     53, Generating present_or_create(temp2[0:152192])
         Generating present_or_create(temp1[0:4756])
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
     55, Loop is parallelizable
         Accelerator kernel generated
         55, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             CC 1.3 : 8 registers; 32 shared, 20 constant, 0 local memory bytes
             CC 2.0 : 12 registers; 0 shared, 56 constant, 0 local memory bytes
     64, Loop is parallelizable
     67, Loop is parallelizable
         Accelerator kernel generated
         64, #pragma acc loop gang, vector(8) /* blockIdx.y threadIdx.y */
         67, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
             CC 1.3 : 11 registers; 32 shared, 20 constant, 0 local memory bytes
             CC 2.0 : 14 registers; 0 shared, 56 constant, 0 local memory bytes
     72, Loop is parallelizable
         Accelerator kernel generated
         72, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 31 registers; 96 shared, 60 constant, 0 local memory bytes
             CC 2.0 : 28 registers; 32 shared, 112 constant, 0 local memory bytes
         75, #pragma acc loop vector(32) /* threadIdx.x */
         78, Sum reduction generated for reduction_1
     75, Loop is parallelizable
     80, Loop is parallelizable
         Accelerator kernel generated
         80, #pragma acc loop gang /* blockIdx.x */
             CC 1.3 : 31 registers; 96 shared, 60 constant, 0 local memory bytes
             CC 2.0 : 28 registers; 32 shared, 112 constant, 0 local memory bytes
         83, #pragma acc loop vector(32) /* threadIdx.x */
         86, Sum reduction generated for reduction_1
     83, Loop is parallelizable
Result is 2185160991.286413,  time is 1749.938965

Accelerator Kernel Timing data
interesting.c
  main
    53: region entered 3182 times
        time(us): total=1,645,141 init=183 region=1,644,958
                  kernels=976,429
        w/o init: total=1,644,958 max=189,133 min=440 avg=516
        55: kernel launched 3182 times
            grid: [38]  block: [128]
            time(us): total=39,806 max=197 min=11 avg=12
        67: kernel launched 3182 times
            grid: [1x595]  block: [32x8]
            time(us): total=64,299 max=38 min=18 avg=20
        72: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=373,710 max=444 min=114 avg=117
        78: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=63,636 max=42 min=14 avg=19
        80: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=371,370 max=304 min=113 avg=116
        86: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=63,608 max=42 min=18 avg=19
interesting.c
  main
    49: region entered 1 time
        time(us): total=1,749,932 init=103,785 region=1,646,147
        w/o init: total=1,646,147 max=1,646,147 min=1,646,147 avg=1,646,147

Hi Mat,

I modify the code as what you pointed, and my gcc version is still not good :). Perhaps it has something to do with the CPU that I am using.

Anyway, for GPU+OpenACC, below is my new timing info. My initialzation time is still 3s. I realized that could it be due to that I have three GPUs on my machine and I am only using one of them? I couldn’t find a suitable command for me to disable or not initialize the other two GPUs.

bo@node240 ~]$ ./a.out 
Result is 2185160991.286413,  time is 4671.968018

Accelerator Kernel Timing data
/home/bo/test1.c
  main
    54: region entered 3182 times
        time(us): total=1,425,879 init=248 region=1,425,631
                  kernels=1,099,710
        w/o init: total=1,425,631 max=1,793 min=444 avg=448
        57: kernel launched 3182 times
            grid: [38]  block: [128]
            time(us): total=14,595 max=17 min=4 avg=4
        68: kernel launched 3182 times
            grid: [1x595]  block: [32x8]
            time(us): total=69,944 max=27 min=21 avg=21
        72: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=458,477 max=155 min=143 avg=144
        78: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=50,967 max=27 min=15 avg=16
        80: kernel launched 3182 times
            grid: [4756]  block: [32]
            time(us): total=454,763 max=149 min=142 avg=142
        86: kernel launched 3182 times
            grid: [1]  block: [256]
            time(us): total=50,964 max=21 min=15 avg=16
/home/bo/test1.c
  main
    49: region entered 1 time
        time(us): total=4,671,945 init=3,244,745 region=1,427,200
        w/o init: total=1,427,200 max=1,427,200 min=1,427,200 avg=1,427,200

My initialzation time is still 3s. I realized that could it be due to that I have three GPUs on my machine and I am only using one of them?

Yes, the init time is ~1 second per device, even if you’re only using one of them. Though, pgcudainit should hold them all open.

You might want to investigate using NVIDIA’s smi utility: https://developer.nvidia.com/nvidia-system-management-interface

  • Mat