I’m running some simple code using SSE intrinsics to get the hang of things, and I’m having some performance problems. For a simple calculation, $\vec{r} = \sqrt{\vec{x}^2 + \vec{y}^2}$ (element-wise), the version with SSE intrinsics is actually running ~2x slower.
I’m pretty sure it’s not a simple coding issue (although it may be a simple versioning issue?). Running the same source with gcc, the SSE version runs ~40% faster (about what I was expecting).
Is there something different that I need to do with pgcc to get the expected behavior? Any thoughts appreciated.
FYI, I’m using: pgcc 7.0-7 64-bit target on x86-64 Linux
Code snippets, compilation commands, and outputs below:
[delaquil@head Cforpost]$ tail -14 Cmain.c
start = clock();
ComputeArrayC( pArray1, pArray2, pResult, nSize );
end = clock();
cpu_time = ((double) (end - start)) / CLOCKS_PER_SEC;
printf ("Regular C elapsed time: %f seconds.\n", cpu_time );
start = clock();
ComputeArrayCSSE( pArray1SSE, pArray2SSE, pResultSSE, nSize );
end = clock();
cpu_time = ((double) (end - start)) / CLOCKS_PER_SEC;
printf ("C with SSE elapsed time: %f seconds.\n", cpu_time );
}
[delaquil@head Cforpost]$ /bin/cat Ctest.c
#include <math.h>
void ComputeArrayC(
float* pArray1,
float* pArray2,
float* pResult,
int nSize)
{
int i;
float* pSource1 = pArray1;
float* pSource2 = pArray2;
float* pDest = pResult;
for ( i = 0; i < nSize; i++ )
{
*pDest = (float)sqrt((*pSource1) * (*pSource1) +
(*pSource2) * (*pSource2)) + 0.5f;
pSource1++;
pSource2++;
pDest++;
}
}
[delaquil@head Cforpost]$ /bin/cat CtestSSE.c
#include<xmmintrin.h>
/*
 * SSE implementation of
 *     pResult[k] = sqrt(pArray1[k]^2 + pArray2[k]^2) + 0.5
 * for every index k in [0, nSize).
 *
 * The bulk of the data is processed four floats at a time with packed
 * SSE instructions. Any 1-3 elements left over when nSize is not a
 * multiple of 4 are processed one at a time with the scalar (_ss) forms
 * of the same instructions, so every one of the nSize outputs is written.
 *
 * pArray1, pArray2  input arrays, at least nSize floats each; must be
 *                   16-byte aligned because they are read with aligned
 *                   packed loads
 * pResult           output array, at least nSize floats; must be
 *                   16-byte aligned (aligned packed stores)
 * nSize             number of elements to process; nothing is written
 *                   when it is zero or negative
 */
void ComputeArrayCSSE(
float* pArray1,
float* pArray2,
float* pResult,
int nSize)
{
    int i;
    int nLoop = nSize / 4;          /* number of full 4-float vectors */
    __m128 m1, m2, m3, m4;
    __m128 m0_5 = _mm_set_ps1(0.5f);

    /* Packed part: elements [0, 4 * nLoop). */
    for (i = 0; i < nLoop; i++) {
        m1 = _mm_load_ps(pArray1 + 4 * i);
        m2 = _mm_load_ps(pArray2 + 4 * i);
        m3 = _mm_add_ps(_mm_mul_ps(m1, m1), _mm_mul_ps(m2, m2));
        m4 = _mm_sqrt_ps(m3);
        _mm_store_ps(pResult + 4 * i, _mm_add_ps(m4, m0_5));
    }

    /*
     * Scalar tail: elements [4 * nLoop, nSize). The _ss intrinsics touch
     * only a single float in memory, so they neither need alignment nor
     * read past the end of the arrays.
     */
    for (i = nLoop * 4; i < nSize; i++) {
        m1 = _mm_load_ss(pArray1 + i);
        m2 = _mm_load_ss(pArray2 + i);
        m3 = _mm_add_ss(_mm_mul_ss(m1, m1), _mm_mul_ss(m2, m2));
        m4 = _mm_sqrt_ss(m3);
        _mm_store_ss(pResult + i, _mm_add_ss(m4, m0_5));
    }
}
[delaquil@head Cforpost]$ pgcc -O4 -Mvect=sse Cmain.c Ctest.c CtestSSE.c aligned_malloc.c
Cmain.c:
Ctest.c:
CtestSSE.c:
aligned_malloc.c:
[delaquil@head Cforpost]$ ./a.out
Regular C elapsed time: 0.140000 seconds.
C with SSE elapsed time: 0.310000 seconds.
[delaquil@head Cforpost]$ gcc -lm -O4 -msse -msse2 Cmain.c Ctest.c CtestSSE.c aligned_malloc.c
[delaquil@head Cforpost]$ ./a.out
Regular C elapsed time: 0.100000 seconds.
C with SSE elapsed time: 0.060000 seconds.
[delaquil@head Cforpost]$
[/code]