Hello:
I’m new to PGI compilers and I’m testing the Community Edition using the file shown at the end of this message. It is a rough matrix multiplication implementation in C, intended only for testing purposes. My computer is a Lenovo W540 laptop running Debian, with a 4-core Intel Core i7-4800MQ 2.7 GHz processor. I ran my program both serially and with OpenMP using all 4 cores (without hyperthreading), built with the following compilers:
GCC 6.3.0, using the flags -O3 and -fopenmp for parallel execution
ICC 14.0.3 using the flags -O3 and -openmp for parallel execution
PGCC 16.10 using the flags -fast and -mp for parallel execution
Now, the times in seconds I’ve obtained (average of three executions):
compiler | serial (s) | 4 cores (s) | speedup
gcc 34.1 20.1 1.70
icc 35.9 10.5 3.42
pgcc 75.0 21.4 3.50
I’m a bit confused about the times of the PGI compiler. While the serial times for gcc and icc are almost the same, the PGI serial time is more than twice as long. However, the PGI OpenMP speedup is the best of the three compilers.
I’ve obtained almost the same times with PGI using the optimization flags -O2, -O2, and -fast -Mipa=inline,fast. Is this behavior in serial execution normal with PGI? Should I use any other optimization flags?
Thanks
#include<stdio.h>
#include<stdlib.h>
#include <sys/time.h>
/* Shorthand for the POSIX timeval record that holds one timestamp. */
typedef struct timeval timestruct;

/* Edge length of the square matrices multiplied by the benchmark. */
#define SIZE 2000

/* Store the current time of day in the timestruct that `a` points to. */
#define gettime(a) gettimeofday((a), NULL)

/*
 * Microseconds elapsed from timestamp t1 to timestamp t2: the whole-second
 * difference scaled to microseconds plus the (possibly negative) difference
 * of the sub-second fields.
 */
#define usec(t1, t2)                                 \
    (((t2).tv_sec - (t1).tv_sec) * 1000000           \
     + ((t2).tv_usec - (t1).tv_usec))
/*
 * Matrix-multiplication benchmark.
 *
 * Fills two SIZE x SIZE row-major matrices A and B, computes C = A * B with
 * a naive triple loop (the outer row loop is shared among threads when the
 * program is built with OpenMP) and prints the elapsed time of the
 * multiplication alone; allocation and initialisation are not timed.
 *
 * Returns 0 on success, EXIT_FAILURE if the matrices cannot be allocated.
 */
int main(void)
{
    /* Element count is computed in size_t so a larger SIZE cannot overflow int. */
    const size_t count = (size_t)SIZE * SIZE;
    timestruct t1, t2;

    double *A = malloc(count * sizeof *A);
    double *B = malloc(count * sizeof *B);
    double *C = malloc(count * sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Failed to allocate %d x %d matrices\n", SIZE, SIZE);
        /* free(NULL) is a no-op, so whichever allocations succeeded are released. */
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < count; i++) {
        A[i] = (double)i + 10.0;
        B[i] = (double)i + 10.0;
    }

    gettime(&t1);
    /*
     * Every index and scratch variable is declared inside the loops, so each
     * thread gets its own copy without an explicit private() clause; only the
     * three matrix pointers are shared.
     */
#pragma omp parallel for default(none) shared(A, B, C)
    for (size_t i = 0; i < SIZE; i++) {
        for (size_t j = 0; j < SIZE; j++) {
            size_t posC = i * SIZE + j;
            double sum = 0.0;
            for (size_t k = 0; k < SIZE; k++) {
                size_t posA = i * SIZE + k;   /* walks along row i of A */
                size_t posB = k * SIZE + j;   /* walks down column j of B */
                sum += A[posA] * B[posB];
            }
            C[posC] = sum;
        }
    }
    gettime(&t2);

    /*
     * Consume every element of C outside the timed region. Without a reader
     * the stores to C are unobservable and an optimizer would be allowed to
     * drop the multiplication, which would make the timing meaningless.
     */
    double checksum = 0.0;
    for (size_t i = 0; i < count; i++) {
        checksum += C[i];
    }
    volatile double sink = checksum;
    (void)sink;

    printf("Time spent: %.5lf seconds\n", (double)usec(t1, t2) / 1000000.0);

    free(A);
    free(B);
    free(C);
    return 0;
}