From a25c9a2925e64f3adb46a749ce18393aa01b1870 Mon Sep 17 00:00:00 2001 From: dante Date: Tue, 4 May 2021 21:09:57 +0200 Subject: tdf#137679 Use KahanSum for SSE2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change-Id: I97970cbb7a9562081f9a84b1d81423c80ed7f7f7 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115113 Tested-by: Jenkins Reviewed-by: Tomaž Vajngerl --- sc/source/core/tool/arraysumSSE2.cxx | 52 +++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'sc/source/core') diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx index 894675335834..e69f672b6014 100644 --- a/sc/source/core/tool/arraysumSSE2.cxx +++ b/sc/source/core/tool/arraysumSSE2.cxx @@ -27,25 +27,65 @@ double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const __m128d sum3 = _mm_setzero_pd(); __m128d sum4 = _mm_setzero_pd(); + __m128d err1 = _mm_setzero_pd(); + __m128d err2 = _mm_setzero_pd(); + __m128d err3 = _mm_setzero_pd(); + __m128d err4 = _mm_setzero_pd(); + + __m128d y, t; + for (; i < nUnrolledSize; i += 8) { + // Kahan sum 1 __m128d load1 = _mm_load_pd(pCurrent); - sum1 = _mm_add_pd(sum1, load1); + y = _mm_sub_pd(load1, err1); + t = _mm_add_pd(sum1, y); + err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y); + sum1 = t; pCurrent += 2; + // Kahan sum 2 __m128d load2 = _mm_load_pd(pCurrent); - sum2 = _mm_add_pd(sum2, load2); + y = _mm_sub_pd(load2, err2); + t = _mm_add_pd(sum2, y); + err2 = _mm_sub_pd(_mm_sub_pd(t, sum2), y); + sum2 = t; pCurrent += 2; + // Kahan sum 3 __m128d load3 = _mm_load_pd(pCurrent); - sum3 = _mm_add_pd(sum3, load3); + y = _mm_sub_pd(load3, err3); + t = _mm_add_pd(sum3, y); + err3 = _mm_sub_pd(_mm_sub_pd(t, sum3), y); + sum3 = t; pCurrent += 2; + // Kahan sum 4 __m128d load4 = _mm_load_pd(pCurrent); - sum4 = _mm_add_pd(sum4, load4); + y = _mm_sub_pd(load4, err4); + t = _mm_add_pd(sum4, y); + err4 = _mm_sub_pd(_mm_sub_pd(t, sum4), y); + sum4 = t; pCurrent += 2; } - sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4)); + + // Now we combine pairwise summation with Kahan summation + + // sum 1 + sum 2 + y = _mm_sub_pd(sum2, err1); + t = _mm_add_pd(sum1, y); + err1 = _mm_sub_pd(_mm_sub_pd(t, sum1), y); + sum1 = t; + + // sum 3 + sum 4 + y = _mm_sub_pd(sum4, err3); + t = _mm_add_pd(sum3, y); + sum3 = t; + + // sum 1 + sum 3 + y = _mm_sub_pd(sum3, err1); + t = _mm_add_pd(sum1, y); + sum1 = t; double temp; @@ -62,4 +102,4 @@ double ArraySumFunctor::executeSSE2(size_t& i, const double* pCurrent) const return 0.0; #endif } -} \ No newline at end of file +} -- cgit v1.2.3