How can I use VFPv4 intrinsics (the way I use NEON intrinsics) on the Jetson TX2? NEON intrinsics on the Jetson TX2 give fast float vector multiplication, but they only support float and int arithmetic and provide no vector division. If VFPv4 intrinsics were supported on the Jetson TX2, could I use them for double-precision division?
Sorry, I am not very familiar with NEON, but I have the NEON test code below for the TX2 — I hope it helps.
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
/*
 * INLINE: inlining hint for the MultiplyMatrix implementations.
 * When the build defines FORCE_INLINE, INLINE expands to a GCC/Clang
 * forced-inline attribute; otherwise it expands to nothing so the
 * compiler decides on its own. Any prior definition is discarded first.
 */
#ifdef INLINE
#undef INLINE
#endif
#ifdef FORCE_INLINE
#define INLINE inline __attribute__((always_inline))
#else
#define INLINE
#endif
/*
 * Returns wall-clock time in milliseconds (with sub-millisecond
 * resolution) for benchmarking. The value is only meaningful as a
 * difference between two calls.
 */
static double GetTimeMS(void)
{
#if 0 /* can't seem to link this statically */
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec * 1000.0 + (double)ts.tv_nsec / 1000000.0;
#else /* this works just the same */
    /* seconds -> ms, microseconds -> ms */
    struct timeval now;
    gettimeofday(&now, NULL);
    return now.tv_sec * 1000.0 + now.tv_usec / 1000.0;
#endif
}
/*
 * 4x4 matrix of floats stored as a flat 16-element array.
 * NOTE(review): a union with a single member behaves exactly like a
 * struct here; presumably other overlays (e.g. a float[4][4] or NEON
 * vector view) were intended at some point — confirm before relying on it.
 */
typedef union {
float m[16];
} Matrix;
#if 0
/*
 * Disabled debug helpers: SetMatrixf fills a Matrix from 16 explicit
 * values (row-major), PrintMatrix dumps a Matrix as a 4x4 grid.
 * Compiled out via #if 0; kept for manual verification of results.
 */
static void SetMatrixf(
Matrix *m,
float m00, float m01, float m02, float m03,
float m10, float m11, float m12, float m13,
float m20, float m21, float m22, float m23,
float m30, float m31, float m32, float m33
)
{
m->m[0] = m00;
m->m[1] = m01;
m->m[2] = m02;
m->m[3] = m03;
m->m[4] = m10;
m->m[5] = m11;
m->m[6] = m12;
m->m[7] = m13;
m->m[8] = m20;
m->m[9] = m21;
m->m[10] = m22;
m->m[11] = m23;
m->m[12] = m30;
m->m[13] = m31;
m->m[14] = m32;
m->m[15] = m33;
}
/* Prints the 16 elements of m as four rows of four. */
static void PrintMatrix(
Matrix *m
)
{
int i, j;
for (i = 0; i < 4; i++) {
for (j = 0; j < 4; j++) {
printf("%f ", m->m[i*4+j]);
}
printf("\n");
}
}
#endif
/*
 * Fills every element of m with a pseudo-random value in [0, 16),
 * quantized to hundredths. Uses rand(); not seeded here, so runs are
 * repeatable.
 */
static void SetRandomMatrix(
Matrix *m
)
{
    int k;
    for (k = 0; k < 16; k++) {
        m->m[k] = (float)(rand() % (16*100))/100.0f;
    }
}
#ifdef USE_CLANG
// C implementation.
/*
 * Reference C implementation: dst = mm1 * mm0 for row-major 4x4
 * matrices. dst row r, column c is the dot product of row r of mm1
 * with column c of mm0, accumulated in the same left-to-right order
 * as the fully unrolled form (identical float rounding).
 * __restrict promises the three matrices do not alias.
 */
INLINE void MultiplyMatrix(
Matrix *__restrict dst,
Matrix *__restrict mm0,
Matrix *__restrict mm1
)
{
    const float *a = mm1->m;  /* left operand rows  */
    const float *b = mm0->m;  /* right operand rows */
    int r, c;
    for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++) {
            float acc = a[r*4 + 0] * b[0*4 + c];
            acc = acc + a[r*4 + 1] * b[1*4 + c];
            acc = acc + a[r*4 + 2] * b[2*4 + c];
            acc = acc + a[r*4 + 3] * b[3*4 + c];
            dst->m[r*4 + c] = acc;
        }
    }
}
#endif
#ifdef USE_NEON64
/*
 * AArch64 inline-asm implementation: dst = m1 * m0 (row-major 4x4).
 * v0-v3 hold the four rows of m0; for each row of m1 (v4) the result
 * row is v0*v4[0] + v1*v4[1] + v2*v4[2] + v3*v4[3], accumulated in v5.
 *
 * Fixes versus the previous version:
 *  - row 0 used "fmul s24, ..." (typo for s21): lane 1 of the first
 *    result row was never computed, and s24 was clobbered without
 *    being listed in the clobber list.
 *  - results were accumulated in s20-s23, which live in v20-v23, yet
 *    every store was "st1 {v5.4s}" — the unrelated register v5 was
 *    written to dst and the computed values were discarded. The
 *    accumulation now happens directly in v5.
 *  - %0 and %1 were advanced with "add" although they were declared
 *    as input operands; modifying an input operand in extended asm is
 *    undefined behavior. Local pointer copies are now passed as "+r"
 *    read-write operands instead.
 */
INLINE void MultiplyMatrix(
Matrix *dst,
Matrix *m0,
Matrix *m1
)
{
    float *out = dst->m;        /* advanced by the asm */
    const float *rows = m1->m;  /* advanced by the asm */
    const float *mat = m0->m;
    __asm__ volatile
    (
        /* v0-v3: the four rows of m0. */
        "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2]\n\t"
        /* dst row 0. */
        "ld1 {v4.4s}, [%1], #16\n\t"
        "fmul v5.4s, v0.4s, v4.s[0]\n\t"
        "fmla v5.4s, v1.4s, v4.s[1]\n\t"
        "fmla v5.4s, v2.4s, v4.s[2]\n\t"
        "fmla v5.4s, v3.4s, v4.s[3]\n\t"
        "st1 {v5.4s}, [%0], #16\n\t"
        /* dst row 1. */
        "ld1 {v4.4s}, [%1], #16\n\t"
        "fmul v5.4s, v0.4s, v4.s[0]\n\t"
        "fmla v5.4s, v1.4s, v4.s[1]\n\t"
        "fmla v5.4s, v2.4s, v4.s[2]\n\t"
        "fmla v5.4s, v3.4s, v4.s[3]\n\t"
        "st1 {v5.4s}, [%0], #16\n\t"
        /* dst row 2. */
        "ld1 {v4.4s}, [%1], #16\n\t"
        "fmul v5.4s, v0.4s, v4.s[0]\n\t"
        "fmla v5.4s, v1.4s, v4.s[1]\n\t"
        "fmla v5.4s, v2.4s, v4.s[2]\n\t"
        "fmla v5.4s, v3.4s, v4.s[3]\n\t"
        "st1 {v5.4s}, [%0], #16\n\t"
        /* dst row 3. */
        "ld1 {v4.4s}, [%1]\n\t"
        "fmul v5.4s, v0.4s, v4.s[0]\n\t"
        "fmla v5.4s, v1.4s, v4.s[1]\n\t"
        "fmla v5.4s, v2.4s, v4.s[2]\n\t"
        "fmla v5.4s, v3.4s, v4.s[3]\n\t"
        "st1 {v5.4s}, [%0]\n\t"
        : "+r" (out), "+r" (rows)       // read-write: advanced above.
        : "r" (mat)                     // input.
        : "memory",
        "v0", "v1", "v2", "v3", "v4", "v5"
        // clobber.
    );
}
#endif
#ifdef USE_NEON64_Quad
/*
 * AArch64 quad-row implementation: dst = m1 * m0 (row-major 4x4).
 * v0-v3 hold the four rows of m0, v4-v7 the four rows of m1; each
 * result row r is v0*row_r[0] + v1*row_r[1] + v2*row_r[2] + v3*row_r[3]
 * accumulated in v8-v11 and stored two rows at a time.
 *
 * Fix versus the previous version: the asm post-increments %0, %1 and
 * %2 via ld1/st1 write-back addressing, but all three were declared as
 * input operands — modifying an input operand in extended asm is
 * undefined behavior. Local pointer copies are now passed as "+r"
 * read-write operands.
 */
INLINE void MultiplyMatrix(
Matrix *dst,
Matrix *m0,
Matrix *m1
)
{
    float *out = dst->m;        /* advanced by the asm */
    const float *rows = m1->m;  /* advanced by the asm */
    const float *mat = m0->m;   /* advanced by the asm */
    __asm__ volatile
    (
        "ld1 {v0.4s, v1.4s}, [%2], #32\n\t"
        "ld1 {v2.4s, v3.4s}, [%2]\n\t"
        "ld1 {v4.4s, v5.4s}, [%1], #32\n\t"
        "fmul v8.4s, v0.4s, v4.s[0]\n\t"
        "fmla v8.4s, v1.4s, v4.s[1]\n\t"
        "fmla v8.4s, v2.4s, v4.s[2]\n\t"
        "fmla v8.4s, v3.4s, v4.s[3]\n\t"
        "ld1 {v6.4s, v7.4s}, [%1], #32\n\t"
        "fmul v9.4s, v0.4s, v5.s[0]\n\t"
        "fmla v9.4s, v1.4s, v5.s[1]\n\t"
        "fmla v9.4s, v2.4s, v5.s[2]\n\t"
        "fmla v9.4s, v3.4s, v5.s[3]\n\t"
        "st1 {v8.4s, v9.4s}, [%0], #32\n\t"
        "fmul v10.4s, v0.4s, v6.s[0]\n\t"
        "fmla v10.4s, v1.4s, v6.s[1]\n\t"
        "fmla v10.4s, v2.4s, v6.s[2]\n\t"
        "fmla v10.4s, v3.4s, v6.s[3]\n\t"
        "fmul v11.4s, v0.4s, v7.s[0]\n\t"
        "fmla v11.4s, v1.4s, v7.s[1]\n\t"
        "fmla v11.4s, v2.4s, v7.s[2]\n\t"
        "fmla v11.4s, v3.4s, v7.s[3]\n\t"
        "st1 {v10.4s, v11.4s}, [%0]\n\t"
        : "+r" (out), "+r" (rows), "+r" (mat)   // read-write: advanced above.
        : // no pure inputs.
        : "memory",
        "v0", "v1", "v2", "v3",
        "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11"
        // clobber.
    );
}
#endif
/*
 * Returns 1 when every element of m0 and m1 agrees within an absolute
 * tolerance of 1e-3, otherwise 0. The tolerance absorbs rounding
 * differences between the scalar and SIMD multiply paths.
 */
static int CompareMatrix(
const Matrix *m0,
const Matrix *m1
)
{
    int idx = 0;
    while (idx < 16) {
        float delta = m0->m[idx] - m1->m[idx];
        if (fabsf(delta) > 0.001f) {
            return 0;
        }
        idx++;
    }
    return 1;
}
#ifdef TARGETED_UNR
__attribute__((optimize("unroll-loops")))
#endif
/*
 * Runs loop_num timing passes over mat_num matrix multiplies
 * (dst[i] = m1[i] * m0[i]) and prints the total elapsed milliseconds
 * tagged with name.
 */
static void MeasureTimeMultiplyMatrix(
Matrix *dst,
Matrix *m0,
Matrix *m1,
int mat_num,
int loop_num,
const char *name
)
{
    int pass, k;
    double t0 = GetTimeMS();
    for (pass = 0; pass < loop_num; pass++) {
        for (k = 0; k < mat_num; k++) {
#if defined(DO_PREFETCH)
            /* Pull operands ~12 matrices ahead into cache.
               NOTE(review): the address runs past the arrays near the
               end of the loop; __builtin_prefetch never faults, so this
               is harmless — confirm that is the intent. */
            __builtin_prefetch(&m0[k + 12].m[0]);
            __builtin_prefetch(&m0[k + 12].m[8]);
            __builtin_prefetch(&m1[k + 12].m[0]);
            __builtin_prefetch(&m1[k + 12].m[8]);
#endif
            MultiplyMatrix(&dst[k], &m0[k], &m1[k]);
        }
    }
    double t1 = GetTimeMS();
    printf("%s(%dx%d): %g msec\n", name, mat_num, loop_num, (t1 - t0));
}
/*
 * MALLOC: 16-byte-aligned allocation for SIMD builds, plain malloc
 * otherwise.
 * NOTE(review): the aligned path is gated on USE_NEON, but the AArch64
 * implementations in this file are selected by USE_NEON64 /
 * USE_NEON64_Quad, so they get plain malloc. AArch64 ld1/st1 without an
 * alignment specifier tolerates unaligned addresses, but confirm the
 * gating is intentional.
 */
#if defined(USE_ALTIVEC) || defined(USE_NEON)
#define MALLOC(size) memalign(16, size)
#else
#define MALLOC(size) malloc(size)
#endif
/*
 * Counts matrices that differ between two result arrays (per
 * CompareMatrix's tolerance) and prints the count tagged with name;
 * silent when all mat_num pairs match.
 */
static void CompareResult(const Matrix *res0, const Matrix *res1, int mat_num, const char *name)
{
    int mismatches = 0;
    int k;
    for (k = 0; k < mat_num; k++) {
        if (CompareMatrix(&res0[k], &res1[k]) == 0) {
            mismatches++;
        }
    }
    if (mismatches) {
        printf("diff(%s)=%d\n", name, mismatches);
    }
}
/*
 * Allocates operand and result arrays of mat_num 4x4 matrices, fills
 * the operands with random values, times the compile-time-selected
 * MultiplyMatrix implementation, and frees everything.
 *
 * Fix: all seven MALLOC results were used without a NULL check, so a
 * failed allocation crashed inside SetRandomMatrix or the benchmark.
 * A failure now reports and falls through to cleanup (free(NULL) is a
 * no-op, so the failure path is safe).
 */
static void LoadTest(int mat_num, int loop_num)
{
    Matrix *m0, *m1, *res0, *res1, *res2, *res3, *res4;
    int i;
    int mat_num2 = mat_num;
    m0 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    m1 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    res0 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    res1 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    res2 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    res3 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    res4 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
    if (!m0 || !m1 || !res0 || !res1 || !res2 || !res3 || !res4) {
        printf("LoadTest: allocation failed\n");
        goto out;
    }
    /* Create random input matrices. */
    for (i = 0; i < mat_num; i++) {
        SetRandomMatrix(&m0[i]);
        SetRandomMatrix(&m1[i]);
    }
    printf("--------------------------------------------\n");
    /* Benchmark the multiply implementation selected at compile time. */
#ifdef USE_CLANG
    const char *name = "MultiplyMatrix_CLang";
#endif
#ifdef USE_NEON64
    const char *name = "MultiplyMatrix_NEON64";
#endif
#ifdef USE_NEON64_Quad
    const char *name = "MultiplyMatrix_NEON64_Quad";
#endif
    MeasureTimeMultiplyMatrix(res2, m0, m1, mat_num, loop_num, name);
out:
    free(m0);
    free(m1);
    free(res0);
    free(res1);
    free(res2);
    free(res3);
    free(res4);
}
/*
 * Entry point: runs one benchmark configuration. Flip the #if to trade
 * many iterations over a small working set for fewer iterations over a
 * larger one. Command-line arguments are ignored.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;
#if 1
    /* small working set, many passes */
    LoadTest(100,5000000);
#else
    /* larger working set, fewer passes */
    LoadTest(1000,1000);
#endif
    return 0;
}
It’s enough to specify the options below when you build. It’ll make the compiler use Advanced SIMD (aka NEON) automatically.
-march=armv8-a -mcpu=cortex-a57 -O3
or
-march=armv8-a -mcpu=cortex-a57 -O2 -ftree-vectorize
Since the Denver cores emulate a Cortex-A57, the above applies to Denver as-is.
If I pin the program to the Denver2 CPU as follows:
CPU_ZERO(&mask); // clear the CPU set
CPU_SET(1, &mask); // add CPU 1 to the affinity set
sched_setaffinity(0, sizeof(mask), &mask); // apply the CPU affinity to this thread
Do I also need to build the program with -march=armv8-a -mcpu=cortex-a57 -O3 or -march=armv8-a -mcpu=cortex-a57 -O2 -ftree-vectorize?