[SBC,2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s

Message ID: 20200811181623.3683374-2-luiz.dentz@gmail.com
State: New

Commit Message

Luiz Augusto von Dentz, Aug. 11, 2020, 6:16 p.m. UTC
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>

This makes use of 128-bit XMM registers whenever possible.

=== Before ===

$ time src/sbcenc_mmx -s 4 sin_4m.au > /dev/null
real    0m1.073s
user    0m1.039s
sys     0m0.030s

=== After ===

$ time src/sbcenc -s 4 sin_4m.au > /dev/null
real    0m1.049s
user    0m1.000s
sys     0m0.047s
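
As a rough illustration of what the rewritten loop computes, here is a
minimal C sketch of the same 40-sample multiply-accumulate using SSE2
intrinsics instead of inline asm. The function name and parameter list
are illustrative only, not part of this patch; like the asm, it assumes
`in` may be unaligned while `consts` and the rounding constant are
16-byte aligned:

#include <emmintrin.h>
#include <stdint.h>

/* Sketch only: one XMM accumulator replaces the old pair of MMX
 * accumulators.  Each _mm_madd_epi16 corresponds to one pmaddwd,
 * each _mm_add_epi32 to one paddd, and _mm_srai_epi32 to the psrad.
 * The patch uses SBC_PROTO_FIXED4_SCALE for `scale`. */
static inline __m128i analyze_four_sketch(const int16_t *in,
					const int16_t *consts,
					__m128i round_c, int scale)
{
	__m128i acc = round_c;	/* rounding added up front, as in the asm */
	int i;

	for (i = 0; i < 80; i += 16) {	/* 5 x 16 bytes = 40 samples */
		__m128i s = _mm_loadu_si128(	/* movdqu: may be unaligned */
			(const __m128i *) ((const char *) in + i));
		__m128i c = _mm_load_si128(	/* aligned, like the pmaddwd
						   memory operand */
			(const __m128i *) ((const char *) consts + i));
		acc = _mm_add_epi32(acc, _mm_madd_epi16(s, c));
	}

	return _mm_srai_epi32(acc, scale);
}

Running pmaddwd on a full XMM register folds each pair of the old MMX
pmaddwd/paddd instructions into one, which is where the small speedup
measured above comes from.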

Patch

---
 sbc/sbc_primitives_sse.c | 58 +++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
index c2b729a..6471bd5 100644
--- a/sbc/sbc_primitives_sse.c
+++ b/sbc/sbc_primitives_sse.c
@@ -38,48 +38,40 @@ 
 static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
 					const FIXED_T *consts)
 {
-	static const SBC_ALIGNED int32_t round_c[2] = {
+	static const SBC_ALIGNED int32_t round_c[4] = {
+		1 << (SBC_PROTO_FIXED4_SCALE - 1),
+		1 << (SBC_PROTO_FIXED4_SCALE - 1),
 		1 << (SBC_PROTO_FIXED4_SCALE - 1),
 		1 << (SBC_PROTO_FIXED4_SCALE - 1),
 	};
 	__asm__ volatile (
-		"movq        (%0), %%mm0\n"
-		"movq       8(%0), %%mm1\n"
-		"pmaddwd     (%1), %%mm0\n"
-		"pmaddwd    8(%1), %%mm1\n"
-		"paddd       (%2), %%mm0\n"
-		"paddd       (%2), %%mm1\n"
+		"movdqu      (%0), %%xmm0\n"
+		"pmaddwd     (%1), %%xmm0\n"
+		"paddd       (%2), %%xmm0\n"
 		"\n"
-		"movq      16(%0), %%mm2\n"
-		"movq      24(%0), %%mm3\n"
-		"pmaddwd   16(%1), %%mm2\n"
-		"pmaddwd   24(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    16(%0), %%xmm1\n"
+		"pmaddwd   16(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
 		"\n"
-		"movq      32(%0), %%mm2\n"
-		"movq      40(%0), %%mm3\n"
-		"pmaddwd   32(%1), %%mm2\n"
-		"pmaddwd   40(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    32(%0), %%xmm1\n"
+		"pmaddwd   32(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
 		"\n"
-		"movq      48(%0), %%mm2\n"
-		"movq      56(%0), %%mm3\n"
-		"pmaddwd   48(%1), %%mm2\n"
-		"pmaddwd   56(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    48(%0), %%xmm1\n"
+		"pmaddwd   48(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
 		"\n"
-		"movq      64(%0), %%mm2\n"
-		"movq      72(%0), %%mm3\n"
-		"pmaddwd   64(%1), %%mm2\n"
-		"pmaddwd   72(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    64(%0), %%xmm1\n"
+		"pmaddwd   64(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
+		"\n"
+		"psrad         %4, %%xmm0\n"
+		"\n"
+		"movdqa    %%xmm0, %%xmm1\n"
+		"punpckhqdq %%xmm1, %%xmm1\n"
+		"movdq2q   %%xmm0, %%mm0\n"
+		"movdq2q   %%xmm1, %%mm1\n"
 		"\n"
-		"psrad         %4, %%mm0\n"
-		"psrad         %4, %%mm1\n"
 		"packssdw   %%mm0, %%mm0\n"
 		"packssdw   %%mm1, %%mm1\n"
 		"\n"