Hi,
I am trying to compile some code that uses SSE intrinsics, but the resulting code is slower than the non-SSE version.
pgcc 7.2-5 64-bit target on x86-64 Linux -tp k8-64e
Copyright 1989-2000, The Portland Group, Inc. All Rights Reserved.
Copyright 2000-2008, STMicroelectronics, Inc. All Rights Reserved.
I examined the generated object code and it appears to be doing many unnecessary operations. For example, in the lines marked with '>' below, %xmm1 is stored to 0x1c0(%rsp) and then immediately reloaded from the same address:
40178b: 66 0f 28 8c 24 c0 01 00 00    movapd 0x1c0(%rsp),%xmm1
401794: 66 0f 58 8c 24 b0 01 00 00    addpd  0x1b0(%rsp),%xmm1
40179d: 66 0f 29 8c 24 c0 01 00 00  > movapd %xmm1,0x1c0(%rsp)
4017a6: 66 0f 28 8c 24 c0 01 00 00  > movapd 0x1c0(%rsp),%xmm1
4017af: 66 0f 29 8c 24 20 04 00 00    movapd %xmm1,0x420(%rsp)
Also, in a test code I introduced a division by 1, and it was not removed by the PGI compiler. On top of that, the compiler used only 2-3 of the 16 SSE registers and kept copying data to memory and back.
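To make the division point concrete, here is a minimal sketch of the pattern I mean (the function name divide_by_one is just a placeholder and is not part of the test program below). Since x / 1.0 is exactly x for doubles, I would expect an optimizing compiler to be able to drop the divpd entirely:

#include <emmintrin.h>

/* Placeholder illustration: dividing by a constant 1.0 is a no-op
   (x / 1.0 == x exactly for doubles), so the divpd this generates
   could in principle be optimized away. */
__m128d divide_by_one(__m128d x) {
    __m128d one = _mm_set1_pd(1.0);   /* plays the role of mm_temp in the test code below */
    return _mm_div_pd(x, one);
}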
To further test this, I compiled the same code with GCC:
gcc version 4.1.2 20070115 (prerelease) (SUSE Linux)
GCC generated noticeably more efficient code. The test code below is simple, so GCC also ends up using only a few registers, but the original program is much more demanding: it runs about 10 times slower with the PGI compiler, which uses only 2 SSE registers, while GCC uses all 16.
The test code is below. Can you tell me why this is happening with the PGI compiler? Is it a bug or a known inefficiency?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#ifdef __SSE2__
#include <emmintrin.h> /* Header file for SSE2 */
#endif
int main (int argc, char** args) {
    int i, size;
    double *a, *a2, *a3, *b, *b2, *b3, *c;
    __m128d mm_a1, mm_a2, mm_a3, mm_a4, mm_b1, mm_b2, mm_b3, mm_b4, mm_temp, mm_c;

    size = 8192;

    /* calloc returns zeroed blocks, 16-byte aligned on 64-bit Linux,
       so the aligned _mm_load_pd/_mm_store_pd below are safe */
    a  = (double*)calloc(size, sizeof(double));
    a2 = (double*)calloc(size, sizeof(double));
    a3 = (double*)calloc(size, sizeof(double));
    b  = (double*)calloc(size, sizeof(double));
    b2 = (double*)calloc(size, sizeof(double));
    b3 = (double*)calloc(size, sizeof(double));
    c  = (double*)calloc(size, sizeof(double));

    mm_temp = _mm_set1_pd(1);   /* divisor of 1.0 for the division test */

    /* fill the input arrays */
    for (i = 0; i < size - 1; i += 2) {
        a[i] = a[i+1] = a2[i] = a2[i+1] = a3[i] = a3[i+1] = i;
        b[i] = b[i+1] = b2[i] = b2[i+1] = b3[i] = b3[i+1] = i;
    }

    /* SSE2 kernel: processes two doubles per iteration */
    for (i = 0; i < size - 1; i += 2) {
        mm_a4 = _mm_set1_pd(0);
        mm_a1 = _mm_load_pd(&a[i]);
        mm_a2 = _mm_load_pd(&a2[i]);
        mm_a3 = _mm_load_pd(&a3[i]);
        mm_a4 = _mm_add_pd(mm_a4, mm_a1);
        mm_a4 = _mm_add_pd(mm_a4, mm_a2);
        mm_a4 = _mm_add_pd(mm_a4, mm_a3);

        mm_b4 = _mm_set1_pd(0);
        mm_b1 = _mm_load_pd(&b[i]);
        mm_b2 = _mm_load_pd(&b2[i]);
        mm_b3 = _mm_load_pd(&b3[i]);
        mm_b4 = _mm_add_pd(mm_b4, mm_a1);
        mm_b4 = _mm_add_pd(mm_b4, mm_a2);
        mm_b4 = _mm_add_pd(mm_b4, mm_a3);

        /* divisions by 1.0 that I expected the compiler to remove */
        mm_a1 = _mm_div_pd(mm_a1, mm_temp);
        mm_a2 = _mm_div_pd(mm_a2, mm_temp);
        mm_a3 = _mm_div_pd(mm_a3, mm_temp);
        mm_b1 = _mm_div_pd(mm_b1, mm_temp);
        mm_b2 = _mm_div_pd(mm_b2, mm_temp);
        mm_b3 = _mm_div_pd(mm_b3, mm_temp);

        mm_c = _mm_set1_pd(0);
        mm_c = _mm_add_pd(mm_c, mm_a1);
        mm_c = _mm_add_pd(mm_c, mm_a2);
        mm_c = _mm_add_pd(mm_c, mm_a3);
        mm_c = _mm_add_pd(mm_c, mm_b1);
        mm_c = _mm_add_pd(mm_c, mm_b2);
        mm_c = _mm_add_pd(mm_c, mm_b3);
        mm_c = _mm_mul_pd(mm_c, mm_a4);
        mm_c = _mm_mul_pd(mm_c, mm_b4);
        _mm_store_pd(&c[i], mm_c);
    }

    for (i = 0; i < size - 1; i += 2) {
        printf("C %f %f\n", c[i], c[i+1]);
    }

    free(a);
    free(a2);
    free(a3);
    free(b);
    free(b2);
    free(b3);
    free(c);

    return 0;
}