I’m trying to build some code containing NEON asm; it compiles fine, but the linker reports “unknown mnemonic” for every instruction.
Never mind, I got it working by adding #include <arm_neon.h> and using the code below. It works and saves 5-10% CPU, but x86 is still better at this:
/**
 * Multiply two 4x4 single-precision matrices.
 *
 * Each matrix is 16 contiguous floats, with element (row, col) stored at
 * index 4*row + col. Under that indexing, row i of result is
 *
 *     base[4*i+0] * (row 0 of mult) + base[4*i+1] * (row 1 of mult)
 *   + base[4*i+2] * (row 2 of mult) + base[4*i+3] * (row 3 of mult)
 *
 * i.e. result = base * mult.
 *
 * NOTE: despite the "_SSE" suffix, the vector path uses ARM NEON intrinsics;
 * the name is kept so existing callers keep working. When NEON is not
 * available a scalar path with the same evaluation order is used instead.
 *
 * mult is read in full before anything is stored, and each row of base is
 * read before the matching row of result is written, so result may be
 * exactly the same array as base or as mult. Partially overlapping arrays
 * are not supported.
 *
 * @param base    left-hand matrix, 16 floats (only read)
 * @param mult    right-hand matrix, 16 floats (only read)
 * @param result  receives the 16-float product
 */
void M4x4_SSE(float *base, float *mult, float *result) {
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Keep the four rows of mult in registers for the whole loop. */
    const float32x4_t c0 = vld1q_f32(&mult[0]);
    const float32x4_t c1 = vld1q_f32(&mult[4]);
    const float32x4_t c2 = vld1q_f32(&mult[8]);
    const float32x4_t c3 = vld1q_f32(&mult[12]);

    for (int i = 0; i < 4; i++) {
        /* Scale each row of mult by the matching scalar from row i of base. */
        float32x4_t r0 = vmulq_n_f32(c0, base[4 * i + 0]);
        const float32x4_t r1 = vmulq_n_f32(c1, base[4 * i + 1]);
        float32x4_t r2 = vmulq_n_f32(c2, base[4 * i + 2]);
        const float32x4_t r3 = vmulq_n_f32(c3, base[4 * i + 3]);

        /* Pairwise reduction: (r0 + r1) + (r2 + r3). */
        r0 = vaddq_f32(r0, r1);
        r2 = vaddq_f32(r2, r3);
        r0 = vaddq_f32(r0, r2);

        vst1q_f32(&result[4 * i], r0);
    }
#else
    /* Scalar fallback for targets without NEON. */

    /* Snapshot mult first so that result may alias it, as in the NEON path. */
    float m[16];
    for (int k = 0; k < 16; k++) {
        m[k] = mult[k];
    }

    for (int i = 0; i < 4; i++) {
        /* Read the whole row of base before storing, so result may alias base. */
        const float b0 = base[4 * i + 0];
        const float b1 = base[4 * i + 1];
        const float b2 = base[4 * i + 2];
        const float b3 = base[4 * i + 3];

        for (int j = 0; j < 4; j++) {
            /* Same association as the vector path: (p0 + p1) + (p2 + p3). */
            const float lo = m[j] * b0 + m[4 + j] * b1;
            const float hi = m[8 + j] * b2 + m[12 + j] * b3;
            result[4 * i + j] = lo + hi;
        }
    }
#endif
}