summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTomaž Vajngerl <tomaz.vajngerl@collabora.com>2015-11-13 12:41:16 +0100
committerTomaž Vajngerl <tomaz.vajngerl@collabora.co.uk>2015-11-13 14:10:10 +0100
commit5493402fb37a1def960c93f7c31aff36a5ab5f9e (patch)
tree73d1b7b23a09b244b00415563f33e5d77118a3f2
parent154bcd887d3772addc8196944044fa57738d3cf2 (diff)
arraysumfunctor: fast sum a double array, use for SUM() in Calc
This adds an array sum functor which sums a double array as fast as possible. There are 2 implementations: SSE2 and a simple unrolled implementation. The SSE2 implementation is used if SSE2 is detected at runtime. Additional info: the SSE2 implementation first processes the array element by element until the current pointer is aligned to a 16-byte boundary (this should only process 1 element). Then the array is processed by summing 8 values in one pass (using 4 variables that are 128 bits wide), where one SSE operation processes 2 double values at a time. Change-Id: I24494b08cae049aa3eabcb086867f1bdd4128374
-rw-r--r--sc/source/core/inc/arraysumfunctor.hxx141
-rw-r--r--sc/source/core/tool/interpr6.cxx15
2 files changed, 145 insertions, 11 deletions
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
new file mode 100644
index 000000000000..776c5143732e
--- /dev/null
+++ b/sc/source/core/inc/arraysumfunctor.hxx
@@ -0,0 +1,141 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#ifndef INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
+#define INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
+
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>
#include <tools/cpuid.hxx>
+
+namespace sc
+{
+
/**
 * Tells whether the address held by a pointer is a multiple of N bytes.
 *
 * The pointer is never dereferenced; only its numeric value is inspected.
 *
 * @tparam T        pointee type
 * @tparam N        required alignment in bytes, must be non-zero
 * @param  pointer  address to test
 * @return true if the address is evenly divisible by N
 */
template<typename T, unsigned int N>
inline bool isAligned(const T* pointer)
{
    // A zero alignment would turn the modulo below into a division by zero.
    static_assert(N != 0, "isAligned: alignment N must be non-zero");
    return 0 == (reinterpret_cast<std::uintptr_t>(pointer) % N);
}
+
+struct ArraySumFunctor
+{
+private:
+ const double* mpArray;
+ size_t mnSize;
+
+public:
+ ArraySumFunctor(const double* pArray, size_t nSize)
+ : mpArray(pArray)
+ , mnSize(nSize)
+ {
+ }
+
+ double operator() ()
+ {
+ static bool hasSSE2 = tools::cpuid::hasSSE2();
+
+ double fSum = 0.0;
+ size_t i = 0;
+ const double* pCurrent = mpArray;
+
+ if (hasSSE2)
+ {
+ while (!isAligned<double, 16>(pCurrent))
+ {
+ fSum += *pCurrent++;
+ i++;
+ }
+ fSum += executeSSE2(i, pCurrent);
+ }
+ else
+ fSum += executeUnrolled(i, pCurrent);
+
+ // sum rest of the array
+
+ for (; i < mnSize; ++i)
+ fSum += mpArray[i];
+
+ return fSum;
+ }
+
+private:
+ inline double executeSSE2(size_t& i, const double* pCurrent) const
+ {
+ double fSum = 0.0;
+ size_t nRealSize = mnSize - i;
+ size_t nUnrolledSize = nRealSize - (nRealSize % 8);
+
+ if (nUnrolledSize > 0)
+ {
+ __m128d sum1 = _mm_setzero_pd();
+ __m128d sum2 = _mm_setzero_pd();
+ __m128d sum3 = _mm_setzero_pd();
+ __m128d sum4 = _mm_setzero_pd();
+
+ for (; i < nUnrolledSize; i += 8)
+ {
+ __m128d load1 = _mm_load_pd(pCurrent);
+ sum1 = _mm_add_pd(sum1, load1);
+ pCurrent += 2;
+
+ __m128d load2 = _mm_load_pd(pCurrent);
+ sum2 = _mm_add_pd(sum2, load2);
+ pCurrent += 2;
+
+ __m128d load3 = _mm_load_pd(pCurrent);
+ sum3 = _mm_add_pd(sum3, load3);
+ pCurrent += 2;
+
+ __m128d load4 = _mm_load_pd(pCurrent);
+ sum4 = _mm_add_pd(sum4, load4);
+ pCurrent += 2;
+ }
+ sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
+
+ double temp;
+
+ _mm_storel_pd(&temp, sum1);
+ fSum += temp;
+
+ _mm_storeh_pd(&temp, sum1);
+ fSum += temp;
+ }
+ return fSum;
+ }
+
+ inline double executeUnrolled(size_t& i, const double* pCurrent) const
+ {
+ size_t nRealSize = mnSize - i;
+ size_t nUnrolledSize = nRealSize - (nRealSize % 4);
+
+ if (nUnrolledSize > 0)
+ {
+ double sum0 = 0.0;
+ double sum1 = 0.0;
+ double sum2 = 0.0;
+ double sum3 = 0.0;
+
+ for (; i < nUnrolledSize; i += 4)
+ {
+ sum0 += *pCurrent++;
+ sum1 += *pCurrent++;
+ sum2 += *pCurrent++;
+ sum3 += *pCurrent++;
+ }
+ return sum0 + sum1 + sum2 + sum3;
+ }
+ return 0.0;
+ }
+};
+
+} // end namespace sc
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx
index a4a75f2d6e7d..5bf453300c81 100644
--- a/sc/source/core/tool/interpr6.cxx
+++ b/sc/source/core/tool/interpr6.cxx
@@ -26,6 +26,8 @@
#include "mtvcellfunc.hxx"
#include "scmatrix.hxx"
+#include "arraysumfunctor.hxx"
+
#include <formula/token.hxx>
using namespace formula;
@@ -235,18 +237,9 @@ public:
if (nDataSize == 0)
return;
- size_t nUnrolled = (nDataSize & 0x3) >> 2;
+ sc::ArraySumFunctor functor(p, nDataSize);
- // Try to encourage the compiler/CPU to do something sensible for the next.
- for (i = 0; i < nUnrolled; i+=4)
- {
- mfRest += p[i];
- mfRest += p[i+1];
- mfRest += p[i+2];
- mfRest += p[i+3];
- }
- for (; i < nDataSize; ++i)
- mfRest += p[i];
+ mfRest += functor();
break;
}