Hello Mat, I have been using gettime() to compare CPU and GPU runtimes, but I found that gettime() sometimes does not report sensible timings, as the following example shows:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <openacc.h>
#include <accelmath.h>
#if defined(_WIN32) || defined(_WIN64)
#include <sys/timeb.h>
#define gettime(a) _ftime(a)
#define usec(t1,t2) ((((t2).time-(t1).time)*1000+((t2).millitm-(t1).millitm))*100)
typedef struct _timeb timestruct;
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a,NULL)
#define usec(t1,t2) (((t2).tv_sec-(t1).tv_sec)*1000000+((t2).tv_usec-(t1).tv_usec))
typedef struct timeval timestruct;
#endif
int main(int argc, char* argv[])
{
    int n = 10000000;  /* size of the vector */
    float *a;          /* the vector */
    float *r;          /* the results */
    float *e;          /* expected results */
    float s, c;
    timestruct t1, t2, t3;
    long long cgpu, chost;
    int i, j;

    a = (float*)malloc(n*sizeof(float));
    r = (float*)malloc(n*sizeof(float));
    e = (float*)malloc(n*sizeof(float));
    for( i = 0; i < n; ++i ) a[i] = (float)(i+1) * 2.0f;

    acc_init( acc_device_nvidia );

    gettime( &t1 );
    #pragma acc loop independent
    for( i = 0; i < n; ++i ){
        s = sinf(a[i]);
        c = cosf(a[i]);
        r[i] = s*s + c*c;
    }
    gettime( &t2 );
    cgpu = usec(t1,t2);

    for( i = 0; i < n; ++i ){
        s = sinf(a[i]);
        c = cosf(a[i]);
        e[i] = s*s + c*c;
    }
    gettime( &t3 );
    chost = usec(t2,t3);

    printf( "%13d iterations completed\n", n );
    printf( "%13ld microseconds on GPU\n", cgpu );
    printf( "%13ld microseconds on host\n", chost );
    return 0;
}
Compiling and running this gives the following output:
"pwz66.c", line 31: warning: variable "j" was declared but never referenced [declared_but_not_referenced]
int i, j;
^
Remark: individual warnings can be suppressed with "--diag_suppress <warning-name>"
10000000 iterations completed
0 microseconds on GPU
0 microseconds on host
But if I modify the OpenACC directives as shown below, the timing looks normal:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <openacc.h>
#include <accelmath.h>
#if defined(_WIN32) || defined(_WIN64)
#include <sys/timeb.h>
#define gettime(a) _ftime(a)
#define usec(t1,t2) ((((t2).time-(t1).time)*1000+((t2).millitm-(t1).millitm))*100)
typedef struct _timeb timestruct;
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a,NULL)
#define usec(t1,t2) (((t2).tv_sec-(t1).tv_sec)*1000000+((t2).tv_usec-(t1).tv_usec))
typedef struct timeval timestruct;
#endif
int main(int argc, char* argv[])
{
    int n = 10000000;  /* size of the vector */
    float *a;          /* the vector */
    float *r;          /* the results */
    float *e;          /* expected results */
    float s, c;
    timestruct t1, t2, t3;
    long long cgpu, chost;
    int i, j;

    a = (float*)malloc(n*sizeof(float));
    r = (float*)malloc(n*sizeof(float));
    e = (float*)malloc(n*sizeof(float));
    for( i = 0; i < n; ++i ) a[i] = (float)(i+1) * 2.0f;

    acc_init( acc_device_nvidia );

    gettime( &t1 );
    #pragma acc kernels copyin(a[0:n]) \
                        create(s[0:n],c[0:n]) \
                        copyout(r[0:n])
    #pragma acc loop independent
    for( i = 0; i < n; ++i ){
        s = sinf(a[i]);
        c = cosf(a[i]);
        r[i] = s*s + c*c;
    }
    gettime( &t2 );
    cgpu = usec(t1,t2);

    for( i = 0; i < n; ++i ){
        s = sinf(a[i]);
        c = cosf(a[i]);
        e[i] = s*s + c*c;
    }
    gettime( &t3 );
    chost = usec(t2,t3);

    printf( "%13d iterations completed\n", n );
    printf( "%13ld microseconds on GPU\n", cgpu );
    printf( "%13ld microseconds on host\n", chost );
    return 0;
}
This run gives the following result:
"pwz66.c", line 31: warning: variable "j" was declared but never referenced [declared_but_not_referenced]
int i, j;
^
Remark: individual warnings can be suppressed with "--diag_suppress <warning-name>"
main:
47, Generating copyout(r[:n]) [if not already present]
Generating create(s) [if not already present]
Generating copyin(a[:n]) [if not already present]
Generating create(c) [if not already present]
Loop is parallelizable
Generating implicit private(s,c)
Generating NVIDIA GPU code
47, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
10000000 iterations completed
33198 microseconds on GPU
320570 microseconds on host
Can you explain why the first version reports these incorrect (zero) timings?
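For reference, the e array is meant to hold the expected results, so I was also planning to add a simple check after the two timed loops (before the printf calls) to confirm the GPU and host results agree; the tolerance value here is just my own guess:

    /* proposed verification, not in the runs above:
       compare GPU results r[] against host results e[] */
    int errs = 0;
    for( i = 0; i < n; ++i ){
        if( fabsf(r[i] - e[i]) > 1.0e-5f )  /* tolerance is an assumption */
            ++errs;
    }
    printf( "%13d errors found\n", errs );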