cudaEvent_t not working with OpenACC with PGI 15.4 compiler

I am trying to compile code that uses cudaEvent_t to get better accuracy when timing my program. I don't know why my code is not compiling properly; I need help figuring out this error.


[sms1990@comet-ln3 .solution]$ pgcc -o nestedpar_C nestedparallel.c -Minfo=accel -ta=nvidia,time,cc3x -O3 -acc -fast -Minline -lcuda
PGC-S-0040-Illegal use of symbol, cudaEvent_t (nestedparallel.c: 25)
PGC-W-0156-Type not specified, ‘int’ assumed (nestedparallel.c: 25)
PGC-S-0037-Syntax error: Recovery attempted by deleting identifier cudaEventDestroy (nestedparallel.c: 61)
PGC-S-0056-Attempt to call non-function (nestedparallel.c: 61)
PGC/x86-64-Extractor Linux 15.4-0: completed with severe errors




1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <time.h>
  4 #include <openacc.h>
  5 //#include <cuda.h>
  6 #pragma acc routine vector
  7 void matvec(float * restrict v, float * restrict x,  float * restrict a, int i, int n)  // stores dot(row i of a, v) into x[i]
  8 {
  9   float sum = 0;  // accumulator named in the reduction(+:sum) clause below
 10   #pragma acc loop vector reduction(+:sum)
 11   for (int j = 0; j < n; j++)
 12     sum += a[i*n+j]*v[j];  // a is indexed as a flat array with n entries per row
 13 
 14   x[i] = sum;
 15 }
 16 int main()  // fills A and V, computes X with matvec(), and times the work with two CUDA events
 17 {
 18 #ifdef _OPENACC
 19   acc_init(acc_device_nvidia);
 20 #endif
 21 
 22   int N = 20000;
 23   int i,j;
 24   float *A, *V, *X;
 25   cudaEvent_t t1, t2;  // cudaEvent_t is not declared in this listing (no CUDA runtime header is included): PGC-S-0040 at line 25
 26 
 27   double start,end,time;  // none of these three is used by any active statement below
 28   double MFLOPS;
 29   A = (float*)malloc(sizeof(float)*N*N);
 30   V = (float*)malloc(sizeof(float)*N);
 31   X = (float*)malloc(sizeof(float)*N);
 32 
 33   cudaEventCreate(&t1);
 34   cudaEventCreate(&t2);
 35 
 36   printf("Matrix Vector Multiplication of [%d,%d] X [%d]\n ", N,N,N);
 37   cudaEventRecord(t1,0);  // start marker, recorded before the device data is created
 38   #pragma acc enter data create(A[0:N*N],V[0:N],X[0:N]) 
 39 
 40     #pragma acc parallel loop gang present(A[0:N*N],V[0:N],X[0:N]) 
 41     for (i = 0; i < N; i++)
 42     {
 43       #pragma acc loop vector
 44       for (j = 0; j < N; j++)
 45         A[i*N+j] = i+j;
 46 
 47       V[i] = i+j;  // reads j after the vector loop above has finished
 48     }
 49 
 50     #pragma acc parallel loop num_gangs(512) vector_length(192) present(A[0:N*N],V[0:N],X[0:N]) 
 51     for (i = 0; i < N; i++)
 52     {
 53       matvec(V,X,A,i,N);
 54     }
 55 
 56   #pragma acc exit data copyout(A[0:N*N],V[0:N],X[0:N])
 57   cudaEventRecord(t2,0);  // stop marker, recorded after the copyout directive
 58   cudaEventSynchronize(t2);
 59   float time_cuda = 0;
 60   cudaEventElapsedTime(&time_cuda,t1,t2)  // no terminating ';' on this line: source of the syntax error reported at line 61
 61   cudaEventDestroy(t1);
 62   cudaEventDestroy(t2);
 63 //  time = (end-start)/CLOCKS_PER_SEC;
 64   MFLOPS = (N*N) / (1E6 * (double)time_cuda);
 65   printf("Execution: %f sec \n", time_cuda);  // NOTE(review): cudaEventElapsedTime reports milliseconds, but the label says sec
 66   printf("MFLOPS/sec : %f\n", MFLOPS);
 67 #ifdef debug
 68   for (i = 0; i < N; i++)
 69   {
 70     for (j = 0; j < N; j++)
 71     {
 72         printf("%0.3f ", A[i*N+j]);
 73     }
 74     printf("\t");
 75     for( j = i; j <= i; j++)
 76     {
 77         printf("%f \t %f ", V[j],X[j]);
 78     }
 79     printf("\n");
 80   }
 81 #endif
 82   return 0;
 83 }

Hi Shahzeb,

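The “Illegal use of symbol” error occurs because “cudaEvent_t” is never declared in your source file. The type is declared in “cuda_runtime.h”, so including that header removes this error. The syntax error reported at line 61 is a separate problem: the call to cudaEventElapsedTime on line 60 is missing its terminating semicolon.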

However, this header file is only configured for use with nvcc or a GNU-compatible compiler, so it will give you an “unsupported compiler” error message when compiled with pgcc. You can use pgc++ instead, which is GNU compatible.

Note that cudaEventElapsedTime returns the time in ms, not seconds. Divide the result by 1000 to get the seconds.

Hope this helps,
Mat

% cat test_051415.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef _OPENACC
#include <openacc.h>
#endif
#include <cuda_runtime.h>
// Computes a single entry of a matrix-vector product.
//   v : input vector, n entries are read
//   x : output vector, only x[i] is written
//   a : matrix stored as a flat array with n entries per row
//   i : index of the row of a to multiply with v
//   n : number of entries in the row and in v
#pragma acc routine vector
void matvec(float * v, float * x,  float * a, int i, int n)
{
  float acc = 0;
  int k;
  #pragma acc loop vector reduction(+:acc)
  for (k = 0; k < n; k++)
  {
    // Flat offset of column k within row i.
    const int idx = i*n+k;
    acc += a[idx]*v[k];
  }

  x[i] = acc;
}
int main()
{
#ifdef _OPENACC
  acc_init(acc_device_nvidia);
#endif

  int N = 20000;
  int i,j;
  float *A, *V, *X;
  cudaEvent_t t1, t2;

  double start,end,time;
  double MFLOPS;
  A = (float*)malloc(sizeof(float)*N*N);
  V = (float*)malloc(sizeof(float)*N);
  X = (float*)malloc(sizeof(float)*N);

  cudaEventCreate(&t1);
  cudaEventCreate(&t2);

  printf("Matrix Vector Multiplication of [%d,%d] X [%d]\n ", N,N,N);
  cudaEventRecord(t1,0);
  #pragma acc enter data create(A[0:N*N],V[0:N],X[0:N])

    #pragma acc parallel loop gang present(A[0:N*N],V[0:N],X[0:N])
    for (i = 0; i < N; i++)
    {
      #pragma acc loop vector
      for (j = 0; j < N; j++)
        A[i*N+j] = i+j;

      V[i] = i+j;
    }

    #pragma acc parallel loop num_gangs(512) vector_length(192) present(A[0:N*N],V[0:N],X[0:N])
    for (i = 0; i < N; i++)
    {
      matvec(V,X,A,i,N);
    }

  #pragma acc exit data copyout(A[0:N*N],V[0:N],X[0:N])
  cudaEventRecord(t2,0);
  cudaEventSynchronize(t2);
  float time_cuda = 0;
  cudaEventElapsedTime(&time_cuda,t1,t2);
  cudaEventDestroy(t1);
  cudaEventDestroy(t2);
  time_cuda/=1000;
//  time = (end-start)/CLOCKS_PER_SEC;
  MFLOPS = (N*N) / (1E6 * (double)time_cuda);
  printf("Execution: %f sec \n", time_cuda);
  printf("MFLOPS/sec : %f\n", MFLOPS);
#ifdef debug
  for (i = 0; i < N; i++)
  {
    for (j = 0; j < N; j++)
    {
        printf("%0.3f ", A[i*N+j]);
    }
    printf("\t");
    for( j = i; j <= i; j++)
    {
        printf("%f \t %f ", V[j],X[j]);
    }
    printf("\n");
  }
#endif
  return 0;
}
% pgc++ -Mcuda -acc -fast -pgf90libs test_051415.c -w
% a.out
Matrix Vector Multiplication of [20000,20000] X [20000]
 Execution: 0.609851 sec
MFLOPS/sec : 655.897657