diff mbox series

[v2,3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s

Message ID 20200819000212.211485-3-luiz.dentz@gmail.com
State Superseded
Series [v2,1/3] sbc: Add initial code for SSE primitives

Commit Message

Luiz Augusto von Dentz Aug. 19, 2020, 12:02 a.m. UTC
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>

This makes use of 128-bit XMM registers whenever possible.

=== Before ===

$ time src/sbcenc_mmx -s 8 sin_64m.au > /dev/null
real    0m1.064s
user    0m1.012s
sys     0m0.049s

=== After ===

$ time src/sbcenc -s 8 sin_64m.au > /dev/null
real    0m1.032s
user    0m0.996s
sys     0m0.033s

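For readers less familiar with the inline assembly below, the multiply-accumulate pattern the patch moves to can be sketched with SSE2 intrinsics roughly as follows. This is only an illustrative sketch, not code from the patch: the function and parameter names are made up, consts is assumed to be the 16-bit fixed-point table (FIXED_T), and the plain 32-bit store at the end stands in for the MMX packing tail that the real routine keeps.

/* Illustrative only: the same pmaddwd/paddd accumulation as the inline
 * assembly, written with SSE2 intrinsics.  Each _mm_madd_epi16 consumes
 * eight 16-bit samples per instruction (128-bit XMM) instead of the four
 * the old MMX code handled per pmaddwd. */
#include <emmintrin.h>
#include <stdint.h>

static void analyze_eight_madd_sketch(const int16_t *in,
			const int16_t *consts, int32_t *out, int scale)
{
	/* Rounding constant replicated into all four 32-bit lanes, which is
	 * why the patch widens round_c[] from two to four entries. */
	__m128i acc0 = _mm_set1_epi32(1 << (scale - 1));
	__m128i acc1 = acc0;
	int i;

	/* 80 samples (160 bytes) in total, processed as five 16-sample blocks */
	for (i = 0; i < 80; i += 16) {
		__m128i in0 = _mm_loadu_si128((const __m128i *) (in + i));
		__m128i in1 = _mm_loadu_si128((const __m128i *) (in + i + 8));
		__m128i c0 = _mm_loadu_si128((const __m128i *) (consts + i));
		__m128i c1 = _mm_loadu_si128((const __m128i *) (consts + i + 8));

		/* pmaddwd + paddd */
		acc0 = _mm_add_epi32(acc0, _mm_madd_epi16(in0, c0));
		acc1 = _mm_add_epi32(acc1, _mm_madd_epi16(in1, c1));
	}

	/* psrad by the fixed-point scale */
	acc0 = _mm_srai_epi32(acc0, scale);
	acc1 = _mm_srai_epi32(acc1, scale);

	_mm_storeu_si128((__m128i *) out, acc0);
	_mm_storeu_si128((__m128i *) (out + 4), acc1);
}

The patch itself stays in hand-written assembly and, after the shifts, splits each 128-bit accumulator back into two MMX registers (punpckhqdq plus movdq2q) so the existing packssdw/store sequence can remain unchanged.
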
Patch

---
 sbc/sbc_primitives_sse.c | 109 ++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 65 deletions(-)

diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
index 2a903e1..9bff6cf 100644
--- a/sbc/sbc_primitives_sse.c
+++ b/sbc/sbc_primitives_sse.c
@@ -96,80 +96,59 @@  static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
 static inline void sbc_analyze_eight_sse(const int16_t *in, int32_t *out,
 							const FIXED_T *consts)
 {
-	static const SBC_ALIGNED int32_t round_c[2] = {
+	static const SBC_ALIGNED int32_t round_c[4] = {
+		1 << (SBC_PROTO_FIXED8_SCALE - 1),
+		1 << (SBC_PROTO_FIXED8_SCALE - 1),
 		1 << (SBC_PROTO_FIXED8_SCALE - 1),
 		1 << (SBC_PROTO_FIXED8_SCALE - 1),
 	};
 	__asm__ volatile (
-		"movq        (%0), %%mm0\n"
-		"movq       8(%0), %%mm1\n"
-		"movq      16(%0), %%mm2\n"
-		"movq      24(%0), %%mm3\n"
-		"pmaddwd     (%1), %%mm0\n"
-		"pmaddwd    8(%1), %%mm1\n"
-		"pmaddwd   16(%1), %%mm2\n"
-		"pmaddwd   24(%1), %%mm3\n"
-		"paddd       (%2), %%mm0\n"
-		"paddd       (%2), %%mm1\n"
-		"paddd       (%2), %%mm2\n"
-		"paddd       (%2), %%mm3\n"
+		"movdqu      (%0), %%xmm0\n"
+		"movdqu    16(%0), %%xmm1\n"
+		"pmaddwd     (%1), %%xmm0\n"
+		"pmaddwd   16(%1), %%xmm1\n"
+		"paddd       (%2), %%xmm0\n"
+		"paddd       (%2), %%xmm1\n"
 		"\n"
-		"movq      32(%0), %%mm4\n"
-		"movq      40(%0), %%mm5\n"
-		"movq      48(%0), %%mm6\n"
-		"movq      56(%0), %%mm7\n"
-		"pmaddwd   32(%1), %%mm4\n"
-		"pmaddwd   40(%1), %%mm5\n"
-		"pmaddwd   48(%1), %%mm6\n"
-		"pmaddwd   56(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    32(%0), %%xmm2\n"
+		"movdqu    48(%0), %%xmm3\n"
+		"pmaddwd   32(%1), %%xmm2\n"
+		"pmaddwd   48(%1), %%xmm3\n"
+		"paddd     %%xmm2, %%xmm0\n"
+		"paddd     %%xmm3, %%xmm1\n"
 		"\n"
-		"movq      64(%0), %%mm4\n"
-		"movq      72(%0), %%mm5\n"
-		"movq      80(%0), %%mm6\n"
-		"movq      88(%0), %%mm7\n"
-		"pmaddwd   64(%1), %%mm4\n"
-		"pmaddwd   72(%1), %%mm5\n"
-		"pmaddwd   80(%1), %%mm6\n"
-		"pmaddwd   88(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    64(%0), %%xmm2\n"
+		"movdqu    80(%0), %%xmm3\n"
+		"pmaddwd   64(%1), %%xmm2\n"
+		"pmaddwd   80(%1), %%xmm3\n"
+		"paddd     %%xmm2, %%xmm0\n"
+		"paddd     %%xmm3, %%xmm1\n"
 		"\n"
-		"movq      96(%0), %%mm4\n"
-		"movq     104(%0), %%mm5\n"
-		"movq     112(%0), %%mm6\n"
-		"movq     120(%0), %%mm7\n"
-		"pmaddwd   96(%1), %%mm4\n"
-		"pmaddwd  104(%1), %%mm5\n"
-		"pmaddwd  112(%1), %%mm6\n"
-		"pmaddwd  120(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    96(%0), %%xmm2\n"
+		"movdqu   112(%0), %%xmm3\n"
+		"pmaddwd   96(%1), %%xmm2\n"
+		"pmaddwd  112(%1), %%xmm3\n"
+		"paddd     %%xmm2, %%xmm0\n"
+		"paddd     %%xmm3, %%xmm1\n"
 		"\n"
-		"movq     128(%0), %%mm4\n"
-		"movq     136(%0), %%mm5\n"
-		"movq     144(%0), %%mm6\n"
-		"movq     152(%0), %%mm7\n"
-		"pmaddwd  128(%1), %%mm4\n"
-		"pmaddwd  136(%1), %%mm5\n"
-		"pmaddwd  144(%1), %%mm6\n"
-		"pmaddwd  152(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    128(%0), %%xmm2\n"
+		"movdqu    144(%0), %%xmm3\n"
+		"pmaddwd   128(%1), %%xmm2\n"
+		"pmaddwd   144(%1), %%xmm3\n"
+		"paddd      %%xmm2, %%xmm0\n"
+		"paddd      %%xmm3, %%xmm1\n"
+		"\n"
+		"psrad         %4, %%xmm0\n"
+		"psrad         %4, %%xmm1\n"
 		"\n"
-		"psrad         %4, %%mm0\n"
-		"psrad         %4, %%mm1\n"
-		"psrad         %4, %%mm2\n"
-		"psrad         %4, %%mm3\n"
+		"movdqa     %%xmm0, %%xmm2\n"
+		"movdqa     %%xmm1, %%xmm3\n"
+		"punpckhqdq %%xmm2, %%xmm2\n"
+		"punpckhqdq %%xmm3, %%xmm3\n"
+		"movdq2q    %%xmm0, %%mm0\n"
+		"movdq2q    %%xmm2, %%mm1\n"
+		"movdq2q    %%xmm1, %%mm2\n"
+		"movdq2q    %%xmm3, %%mm3\n"
 		"\n"
 		"packssdw   %%mm0, %%mm0\n"
 		"packssdw   %%mm1, %%mm1\n"