How can I use VFPv4 intrinsics, like the NEON intrinsics, on the Jetson TX2?

How can I use VFPv4 intrinsics, like the NEON intrinsics, on the Jetson TX2? Using NEON intrinsics on the Jetson TX2, I have float vector multiply working, and it makes the calculation much faster. But those intrinsics only support float and int arithmetic, with no support for division. If VFPv4 intrinsics were supported on the Jetson TX2, could I use them for double-precision division?
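For example, this is the kind of float vector multiply I mean (a minimal sketch; the function and array names are just for illustration):

#include <arm_neon.h>

// Multiply two float arrays four lanes at a time with NEON intrinsics.
static void MulFloat4(float *dst, const float *a, const float *b, int n)
{
	int i;
	for (i = 0; i + 4 <= n; i += 4) {
		float32x4_t va = vld1q_f32(&a[i]);	// load 4 floats
		float32x4_t vb = vld1q_f32(&b[i]);
		vst1q_f32(&dst[i], vmulq_f32(va, vb));	// 4 multiplies at once
	}
	for (; i < n; i++) {	// scalar tail
		dst[i] = a[i] * b[i];
	}
}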

Sorry, I am not quite familiar with NEON, but I have the NEON test code below for the TX2; hope this can help.

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>

#ifdef INLINE
#undef INLINE
#endif

#ifdef FORCE_INLINE
#define INLINE  inline __attribute__((always_inline))
#else
#define INLINE
#endif

static double GetTimeMS(void)
{
#if 0 /* can't seem to link this statically (clock_gettime needs -lrt on older glibc) */
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (double)ts.tv_sec * 1000.0 + (double)ts.tv_nsec / 1000000.0;
#else /* this works just the same */
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
#endif
}

typedef union {
	float	m[16];
} Matrix;

#if 0
static void SetMatrixf(
	Matrix *m,
	float m00, float m01, float m02, float m03,
	float m10, float m11, float m12, float m13,
	float m20, float m21, float m22, float m23,
	float m30, float m31, float m32, float m33
)
{
	m->m[0] = m00;
	m->m[1] = m01;
	m->m[2] = m02;
	m->m[3] = m03;

	m->m[4] = m10;
	m->m[5] = m11;
	m->m[6] = m12;
	m->m[7] = m13;

	m->m[8] = m20;
	m->m[9] = m21;
	m->m[10] = m22;
	m->m[11] = m23;
	
	m->m[12] = m30;
	m->m[13] = m31;
	m->m[14] = m32;
	m->m[15] = m33;
}

static void PrintMatrix(
	Matrix *m
)
{
	int i, j;
	for (i = 0; i < 4; i++) {
		for (j = 0; j < 4; j++) {
			printf("%f ", m->m[i*4+j]);
		}
		printf("\n");
	}
}
#endif

static void SetRandomMatrix(
	Matrix *m
)
{
	int i;
	for (i = 0; i < 16; i++) {
		m->m[i] = (float)(rand() % (16*100))/100.0f;
	}
}

#ifdef USE_CLANG
// C implementation.
INLINE void MultiplyMatrix(
	Matrix *__restrict dst,
	Matrix *__restrict mm0,
	Matrix *__restrict mm1
)
{
	float *m0 = mm0->m;
	float *m1 = mm1->m;
	
	dst->m[0] = m1[0] * m0[0] + m1[1] * m0[4] + m1[2] * m0[8] + m1[3] * m0[12];
	dst->m[1] = m1[0] * m0[1] + m1[1] * m0[5] + m1[2] * m0[9] + m1[3] * m0[13];
	dst->m[2] = m1[0] * m0[2] + m1[1] * m0[6] + m1[2] * m0[10] + m1[3] * m0[14];
	dst->m[3] = m1[0] * m0[3] + m1[1] * m0[7] + m1[2] * m0[11] + m1[3] * m0[15];
		
	dst->m[4] = m1[4] * m0[0] + m1[5] * m0[4] + m1[6] * m0[8] + m1[7] * m0[12];
	dst->m[5] = m1[4] * m0[1] + m1[5] * m0[5] + m1[6] * m0[9] + m1[7] * m0[13];
	dst->m[6] = m1[4] * m0[2] + m1[5] * m0[6] + m1[6] * m0[10] + m1[7] * m0[14];
	dst->m[7] = m1[4] * m0[3] + m1[5] * m0[7] + m1[6] * m0[11] + m1[7] * m0[15];
	
	dst->m[8] = m1[8] * m0[0] + m1[9] * m0[4] + m1[10] * m0[8] + m1[11] * m0[12];
	dst->m[9] = m1[8] * m0[1] + m1[9] * m0[5] + m1[10] * m0[9] + m1[11] * m0[13];
	dst->m[10] = m1[8] * m0[2] + m1[9] * m0[6] + m1[10] * m0[10] + m1[11] * m0[14];
	dst->m[11] = m1[8] * m0[3] + m1[9] * m0[7] + m1[10] * m0[11] + m1[11] * m0[15];
	
	dst->m[12] = m1[12] * m0[0] + m1[13] * m0[4] + m1[14] * m0[8] + m1[15] * m0[12];
	dst->m[13] = m1[12] * m0[1] + m1[13] * m0[5] + m1[14] * m0[9] + m1[15] * m0[13];
	dst->m[14] = m1[12] * m0[2] + m1[13] * m0[6] + m1[14] * m0[10] + m1[15] * m0[14];
	dst->m[15] = m1[12] * m0[3] + m1[13] * m0[7] + m1[14] * m0[11] + m1[15] * m0[15];
}
#endif

#ifdef USE_NEON64

INLINE void MultiplyMatrix(
	Matrix *dst,
	Matrix *m0,
	Matrix *m1
)
{
	float *d = dst->m;
	const float *a = m0->m;	/* kept in v0-v3 for the whole multiply */
	const float *b = m1->m;	/* streamed one row at a time through v4 */

	// v0-v3: the four rows of m0.
	// v4: the current row of m1.
	// v5: the current row of the result.
	// On AArch64 the AArch32 s-register aliasing no longer holds
	// (s20 is lane 0 of v20, not part of v5), so each row sum is
	// accumulated in a whole vector register. The pointers are
	// post-incremented inside the asm, so they are passed as
	// read-write ("+r") operands.
	__asm__ volatile
	(
		"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2]\n\t"
		
		// Row 0: dst[0-3].
		"ld1 {v4.4s}, [%1], #16\n\t"
		"fmul v5.4s, v0.4s, v4.s[0]\n\t"
		"fmla v5.4s, v1.4s, v4.s[1]\n\t"
		"fmla v5.4s, v2.4s, v4.s[2]\n\t"
		"fmla v5.4s, v3.4s, v4.s[3]\n\t"
		"st1 {v5.4s}, [%0], #16\n\t"
		
		// Row 1: dst[4-7].
		"ld1 {v4.4s}, [%1], #16\n\t"
		"fmul v5.4s, v0.4s, v4.s[0]\n\t"
		"fmla v5.4s, v1.4s, v4.s[1]\n\t"
		"fmla v5.4s, v2.4s, v4.s[2]\n\t"
		"fmla v5.4s, v3.4s, v4.s[3]\n\t"
		"st1 {v5.4s}, [%0], #16\n\t"
		
		// Row 2: dst[8-11].
		"ld1 {v4.4s}, [%1], #16\n\t"
		"fmul v5.4s, v0.4s, v4.s[0]\n\t"
		"fmla v5.4s, v1.4s, v4.s[1]\n\t"
		"fmla v5.4s, v2.4s, v4.s[2]\n\t"
		"fmla v5.4s, v3.4s, v4.s[3]\n\t"
		"st1 {v5.4s}, [%0], #16\n\t"
		
		// Row 3: dst[12-15].
		"ld1 {v4.4s}, [%1]\n\t"
		"fmul v5.4s, v0.4s, v4.s[0]\n\t"
		"fmla v5.4s, v1.4s, v4.s[1]\n\t"
		"fmla v5.4s, v2.4s, v4.s[2]\n\t"
		"fmla v5.4s, v3.4s, v4.s[3]\n\t"
		"st1 {v5.4s}, [%0]\n\t"
		
		: "+r" (d), "+r" (b)	// read-write: post-incremented above.
		: "r" (a)		// input.
		: "memory",
		"v0", "v1", "v2", "v3", "v4", "v5"
		// clobber.
	);
}

#endif

#ifdef USE_NEON64_Quad

INLINE void MultiplyMatrix(
	Matrix *dst,
	Matrix *m0,
	Matrix *m1
)
{
	float *d = dst->m;
	const float *a = m0->m;
	const float *b = m1->m;

	// v0-v3: the four rows of m0.
	// v4-v7: the four rows of m1.
	// v8-v11: the four rows of the result.
	// The pointers are post-incremented inside the asm, so they are
	// passed as read-write ("+r") operands.
	__asm__ volatile
	(
		"ld1 {v0.4s, v1.4s}, [%2], #32\n\t"
		"ld1 {v2.4s, v3.4s}, [%2]\n\t"
		
		"ld1 {v4.4s, v5.4s}, [%1], #32\n\t"
		
		"fmul v8.4s, v0.4s, v4.s[0]\n\t"
		"fmla v8.4s, v1.4s, v4.s[1]\n\t"
		"fmla v8.4s, v2.4s, v4.s[2]\n\t"
		"fmla v8.4s, v3.4s, v4.s[3]\n\t"
		
		"ld1 {v6.4s, v7.4s}, [%1], #32\n\t"
		
		"fmul v9.4s, v0.4s, v5.s[0]\n\t"
		"fmla v9.4s, v1.4s, v5.s[1]\n\t"
		"fmla v9.4s, v2.4s, v5.s[2]\n\t"
		"fmla v9.4s, v3.4s, v5.s[3]\n\t"
		
		"st1 {v8.4s, v9.4s}, [%0], #32\n\t"
		
		"fmul v10.4s, v0.4s, v6.s[0]\n\t"
		"fmla v10.4s, v1.4s, v6.s[1]\n\t"
		"fmla v10.4s, v2.4s, v6.s[2]\n\t"
		"fmla v10.4s, v3.4s, v6.s[3]\n\t"
	
		"fmul v11.4s, v0.4s, v7.s[0]\n\t"
		"fmla v11.4s, v1.4s, v7.s[1]\n\t"
		"fmla v11.4s, v2.4s, v7.s[2]\n\t"
		"fmla v11.4s, v3.4s, v7.s[3]\n\t"
		
		"st1 {v10.4s, v11.4s}, [%0]\n\t"
		
		: "+r" (d), "+r" (b), "+r" (a)	// read-write: post-incremented above.
		: // no pure inputs.
		: "memory", 
		"v0", "v1", "v2", "v3", 
		"v4", "v5", "v6", "v7", 
		"v8", "v9", "v10", "v11"
		// clobber.
	);
}

#endif

static int CompareMatrix(
	const Matrix *m0,
	const Matrix *m1
)
{
	int i;
	for (i = 0; i < 16; i++) {
		if (fabsf(m0->m[i] - m1->m[i]) > 0.001f) {
			return 0;
		}
	}
	return 1;
}

#ifdef TARGETED_UNR
__attribute__((optimize("unroll-loops")))
#endif
static void MeasureTimeMultiplyMatrix(
	Matrix *dst,
	Matrix *m0,
	Matrix *m1,
	int mat_num,
	int loop_num,
	const char *name
)
{
	double startTime, endTime;
	int i, j;
	
	startTime = GetTimeMS();
	
	for (j = 0; j < loop_num; j++) {
		for (i = 0; i < mat_num; i++) {
#if defined(DO_PREFETCH)
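            // Prefetch a few iterations ahead to hide memory latency;
            // hints past the end of the arrays are harmless, since
            // prefetches never fault.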
            __builtin_prefetch(&m0[i + 12].m[0]);
            __builtin_prefetch(&m0[i + 12].m[8]);
            __builtin_prefetch(&m1[i + 12].m[0]);
            __builtin_prefetch(&m1[i + 12].m[8]);
#endif
			MultiplyMatrix(&dst[i], &m0[i], &m1[i]);
		}
	}
	
	endTime = GetTimeMS();
	
	printf("%s(%dx%d): %g msec\n", name, mat_num, loop_num, (endTime - startTime));
}

#if defined(USE_ALTIVEC) || defined(USE_NEON) || defined(USE_NEON64) || defined(USE_NEON64_Quad)
#define MALLOC(size) memalign(16, size)
#else
#define MALLOC(size) malloc(size)
#endif

static void CompareResult(const Matrix *res0, const Matrix *res1, int mat_num, const char *name)
{
	int i;
	int diff = 0;
	for (i = 0; i < mat_num; i++) {
		if (!CompareMatrix(&res0[i], &res1[i])) {
			diff++;
		}
	}
	if (diff) {
		printf("diff(%s)=%d\n", name, diff);
	}
}
	

static void LoadTest(int mat_num, int loop_num)
{
	Matrix *m0, *m1, *res0, *res1, *res2, *res3, *res4;
	int i;
	int mat_num2 = mat_num;
	
	//printf("arg: %d, %d, %d\n", mat_num, loop_num, sizeof(Matrix));
	
	m0 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
	m1 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
	res0 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
	res1 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
	res2 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
	res3 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);
	res4 = (Matrix*)MALLOC(sizeof(Matrix) * mat_num2);


	// create random matrix.
	for (i = 0; i < mat_num; i++) {
		SetRandomMatrix(&m0[i]);
		SetRandomMatrix(&m1[i]);
	}

	printf("--------------------------------------------\n");
	
	// Calculate the matrix products (the first pass also warms the cache).
#if defined(USE_CLANG)
	const char *name = "MultiplyMatrix_CLang";
#elif defined(USE_NEON64)
	const char *name = "MultiplyMatrix_NEON64";
#elif defined(USE_NEON64_Quad)
	const char *name = "MultiplyMatrix_NEON64_Quad";
#else
#error "Define one of USE_CLANG, USE_NEON64 or USE_NEON64_Quad."
#endif

	MeasureTimeMultiplyMatrix(res2, m0, m1, mat_num, loop_num, name);


	free(m0);
	free(m1);
	free(res0);
	free(res1);
	free(res2);
	free(res3);
	free(res4);
}


int main(int argc, char **argv)
{
#if 1
	LoadTest(100,5000000);
#else
	LoadTest(1000,1000);
#endif
	return 0;
}

It’s enough to specify the options below when you build. It’ll make the compiler use Advanced SIMD (aka NEON) automatically.

-march=armv8-a -mcpu=cortex-a57 -O3
or
-march=armv8-a -mcpu=cortex-a57 -O2 -ftree-vectorize
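
For example, to build the test above (assuming the listing is saved as neon_test.c — the file name is just for illustration):

gcc -DUSE_NEON64_Quad -DFORCE_INLINE -march=armv8-a -mcpu=cortex-a57 -O3 neon_test.c -o neon_test

Swap -DUSE_NEON64_Quad for -DUSE_CLANG or -DUSE_NEON64 to benchmark the other variants, and add -DDO_PREFETCH to enable the software prefetch.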

Since the Denver cores run the same ARMv8-A code as the A57, the above applies to Denver as it is.
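
On the original double-division question: the TX2's cores are ARMv8-A, and in 64-bit (AArch64) code the Advanced SIMD instructions do cover double precision, including division, so there is no need for separate VFPv4-style intrinsics. Plain scalar double division compiles to the FPU's fdiv, and arm_neon.h exposes a two-lane vector form. A minimal sketch (the function name is just for illustration):

#include <arm_neon.h>

// Divide two pairs of doubles at once; vdivq_f64 is available
// in AArch64 builds of arm_neon.h.
static void DivDouble2(double *dst, const double *a, const double *b)
{
	float64x2_t va = vld1q_f64(a);
	float64x2_t vb = vld1q_f64(b);
	vst1q_f64(dst, vdivq_f64(va, vb));
}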

If I set the programme to run on the Denver2 CPU as follows:

cpu_set_t mask;   // needs #define _GNU_SOURCE and #include <sched.h>

CPU_ZERO(&mask);                            // clear the CPU set
CPU_SET(1, &mask);                          // add CPU 1 (a Denver core) to the set
sched_setaffinity(0, sizeof(mask), &mask);  // apply the thread's CPU affinity

Do I also need to build the programme with -march=armv8-a -mcpu=cortex-a57 -O3 or -march=armv8-a -mcpu=cortex-a57 -O2 -ftree-vectorize?