properly separate code built with different CPU settings

Trying to write smart code and mixing different CPU flags doesn't play nice together. Those global variables are not runtime-protected by a CPU check, and so may crash with illegal instruction error. And those inline functions may not get inlined and the compiler is free to choose just one copy, any of them, so it may be the one requiring the most demanding CPU settings. So use only dumb code in files compiled with CPU intrinsics. Change-Id: I8200fd4d9f991fab6fdc741120e7aa96ff9b470d Reviewed-on: https://gerrit.libreoffice.org/c/core/+/121929 Tested-by: Jenkins Reviewed-by: Dante DM <dante19031999@gmail.com> Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
author: Luboš Luňák <l.lunak@collabora.com> 2021-09-10 23:43:33 +0200
committer: Luboš Luňák <l.lunak@collabora.com> 2021-09-13 11:08:20 +0200
commit: 26072b8db7ba53f00c83197cb064229a76001989 (patch)
tree: d144b6b8094f91a92456306cfc0b31d602fc3577 /sc
parent: f75bae7082322a2e79997d041ce5f8afd3910107 (diff)
8 files changed, 144 insertions, 59 deletions
diff --git a/sc/inc/arraysumfunctor.hxx b/sc/inc/arraysumfunctor.hxx
index ecd428e9f037..d251b4a6f9fb 100644
--- a/sc/inc/arraysumfunctor.hxx
+++ b/sc/inc/arraysumfunctor.hxx
@@ -12,35 +12,17 @@
 
 #include <cmath>
 #include "kahan.hxx"
-#include "scdllapi.h"
+#include "arraysumfunctorinternal.hxx"
 #include <tools/cpuid.hxx>
 #include <formula/errorcodes.hxx>
 
 namespace sc::op
 {
 /* Checkout available optimization options */
-SC_DLLPUBLIC extern const bool hasAVX512F;
 const bool hasAVX = cpuid::hasAVX();
 const bool hasSSE2 = cpuid::hasSSE2();
 
 /**
-  * Performs one step of the Neumanier sum between doubles
-  * Overwrites the summand and error
-  * @parma sum
-  * @param err
-  * @param value
-  */
-inline void sumNeumanierNormal(double& sum, double& err, const double& value)
-{
-    double t = sum + value;
-    if (std::abs(sum) >= std::abs(value))
-        err += (sum - t) + value;
-    else
-        err += (value - t) + sum;
-    sum = t;
-}
-
-/**
   * If no boosts available, Unrolled KahanSum.
   * Most likely to use on android.
   */
@@ -69,11 +51,6 @@ static inline KahanSum executeUnrolled(size_t& i, size_t nSize, const double* pC
     return 0.0;
 }
 
-/* Available methods */
-SC_DLLPUBLIC KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent);
-SC_DLLPUBLIC KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent);
-SC_DLLPUBLIC KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
-
 /**
   * This function task is to choose the fastest method available to perform the sum.
   * @param i
diff --git a/sc/inc/arraysumfunctorinternal.hxx b/sc/inc/arraysumfunctorinternal.hxx
new file mode 100644
index 000000000000..a06e3fc17439
--- /dev/null
+++ b/sc/inc/arraysumfunctorinternal.hxx
@@ -0,0 +1,34 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include "scdllapi.h"
+
+namespace sc::op
+{
+SC_DLLPUBLIC extern const bool hasAVX512F;
+
+// Plain old data structure, to be used by code compiled with CPU intrinsics without generating any
+// code for it (so that code requiring intrinsics doesn't get accidentally selected as the one copy
+// when merging duplicates).
+struct KahanSumSimple
+{
+    double m_fSum;
+    double m_fError;
+};
+
+/* Available methods */
+SC_DLLPUBLIC KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent);
+SC_DLLPUBLIC KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent);
+SC_DLLPUBLIC KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent);
+
+} // namespace
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */
diff --git a/sc/inc/kahan.hxx b/sc/inc/kahan.hxx
index 3404fb6d14a6..ded7bd78d70e 100644
--- a/sc/inc/kahan.hxx
+++ b/sc/inc/kahan.hxx
@@ -11,6 +11,8 @@
 
 #include <cmath>
 
+#include "arraysumfunctorinternal.hxx"
+
 /**
   * This class provides LO with Kahan summation algorithm
   * About this algorithm: https://en.wikipedia.org/wiki/Kahan_summation_algorithm
@@ -34,6 +36,12 @@ public:
     {
     }
 
+    constexpr KahanSum(const sc::op::KahanSumSimple& sum)
+        : m_fSum(sum.m_fSum)
+        , m_fError(sum.m_fError)
+    {
+    }
+
     constexpr KahanSum(const KahanSum& fSum) = default;
 
 public:
diff --git a/sc/qa/unit/functions_statistical.cxx b/sc/qa/unit/functions_statistical.cxx
index 2e489d26dd0d..a034964f4923 100644
--- a/sc/qa/unit/functions_statistical.cxx
+++ b/sc/qa/unit/functions_statistical.cxx
@@ -37,13 +37,13 @@ void StatisticalFunctionsTest::testIntrinsicSums()
     double* pCurrent = summands;
     size_t i = 0;
     if (sc::op::hasAVX512F)
-        CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX512F(i, 9, pCurrent).get());
+        CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX512F(i, 9, pCurrent)).get());
     i = 0;
     if (sc::op::hasAVX)
-        CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeAVX(i, 9, pCurrent).get());
+        CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeAVX(i, 9, pCurrent)).get());
     i = 0;
     if (sc::op::hasSSE2)
-        CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeSSE2(i, 9, pCurrent).get());
+        CPPUNIT_ASSERT_EQUAL(42.0, KahanSum(sc::op::executeSSE2(i, 9, pCurrent)).get());
     i = 0;
     CPPUNIT_ASSERT_EQUAL(42.0, sc::op::executeUnrolled(i, 9, pCurrent).get());
 }
diff --git a/sc/source/core/tool/arraysum.hxx b/sc/source/core/tool/arraysum.hxx
new file mode 100644
index 000000000000..62c2514182b7
--- /dev/null
+++ b/sc/source/core/tool/arraysum.hxx
@@ -0,0 +1,52 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#pragma once
+
+#include <cmath>
+
+namespace sc::op
+{
+// Code must not be shared between different CPU instrinsics flags (e.g. in debug mode the compiler would not
+// inline them, and merge the copies, keeping only the one with the most demanding CPU set that's not available otherwise).
+// Put everything in a different namespace and additionally try to force inlining.
+namespace LO_ARRAYSUM_SPACE
+{
+#if defined _MSC_VER
+#define INLINE __forceinline static
+#elif defined __GNUC__
+#define INLINE __attribute__((always_inline)) static inline
+#else
+#define static inline
+#endif
+
+/**
+  * Performs one step of the Neumanier sum between doubles
+  * Overwrites the summand and error
+  * @parma sum
+  * @param err
+  * @param value
+  */
+INLINE void sumNeumanierNormal(double& sum, double& err, const double& value)
+{
+    double t = sum + value;
+    if (std::abs(sum) >= std::abs(value))
+        err += (sum - t) + value;
+    else
+        err += (value - t) + sum;
+    sum = t;
+}
+
+#undef INLINE
+
+} // end namespace LO_ARRAYSUM_SPACE
+} // end namespace sc::op
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/arraysumAVX.cxx b/sc/source/core/tool/arraysumAVX.cxx
index 49407b95dfb6..54c47780f63c 100644
--- a/sc/source/core/tool/arraysumAVX.cxx
+++ b/sc/source/core/tool/arraysumAVX.cxx
@@ -8,21 +8,28 @@
  *
  */
 
-#include <arraysumfunctor.hxx>
-#include <sal/log.hxx>
+#define LO_ARRAYSUM_SPACE AVX
+#include "arraysum.hxx"
+
+#include <arraysumfunctorinternal.hxx>
+
 #include <tools/simd.hxx>
 #include <tools/simdsupport.hxx>
 
+#include <cstdlib>
+
 namespace sc::op
 {
 #ifdef LO_AVX_AVAILABLE // Old processors
 
-const __m256d ANNULATE_SIGN_BIT = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+using namespace AVX;
 
 /** Kahan sum with AVX.
   */
 static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value)
 {
+    const __m256d ANNULATE_SIGN_BIT
+        = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
     // Temporal parameter
     __m256d t = _mm256_add_pd(sum, value);
     // Absolute value of the total sum
@@ -45,7 +52,7 @@ static inline void sumAVX(__m256d& sum, __m256d& err, const __m256d& value)
 
 /** Execute Kahan sum with AVX.
   */
-KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent)
+KahanSumSimple executeAVX(size_t& i, size_t nSize, const double* pCurrent)
 {
 #ifdef LO_AVX_AVAILABLE
     // Make sure we don't fall out of bounds.
@@ -97,15 +104,14 @@ KahanSum executeAVX(size_t& i, size_t nSize, const double* pCurrent)
         sumNeumanierNormal(sums[0], errs[0], errs[2]);
 
         // Store result
-        return KahanSum(sums[0], errs[0]);
+        return { sums[0], errs[0] };
     }
-    return 0.0;
+    return { 0.0, 0.0 };
 #else
-    SAL_WARN("sc", "Failed to use AVX");
     (void)i;
     (void)nSize;
     (void)pCurrent;
-    return 0.0;
+    abort();
 #endif
 }
 
diff --git a/sc/source/core/tool/arraysumAVX512.cxx b/sc/source/core/tool/arraysumAVX512.cxx
index 0fa49c6bccc8..f8e8de729279 100644
--- a/sc/source/core/tool/arraysumAVX512.cxx
+++ b/sc/source/core/tool/arraysumAVX512.cxx
@@ -8,11 +8,16 @@
  *
  */
 
-#include <arraysumfunctor.hxx>
-#include <sal/log.hxx>
+#define LO_ARRAYSUM_SPACE AVX512
+#include "arraysum.hxx"
+
+#include <arraysumfunctorinternal.hxx>
+
 #include <tools/simd.hxx>
 #include <tools/simdsupport.hxx>
 
+#include <cstdlib>
+
 /* TODO Remove this once GCC updated and AVX512 can work. */
 #ifdef __GNUC__
 #if __GNUC__ < 9
@@ -23,16 +28,18 @@
 #endif
 #endif
 
+namespace sc::op
+{
 #ifdef LO_AVX512F_AVAILABLE
-const bool sc::op::hasAVX512F = cpuid::hasAVX512F();
+const bool hasAVX512F = cpuid::hasAVX512F();
 #else
-const bool sc::op::hasAVX512F = false;
+const bool hasAVX512F = false;
 #endif
 
-namespace sc::op
-{
 #ifdef LO_AVX512F_AVAILABLE // New processors
 
+using namespace AVX512;
+
 /** Kahan sum with AVX512.
   */
 static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
@@ -59,7 +66,7 @@ static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
 
 /** Execute Kahan sum with AVX512.
   */
-KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
+KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
 {
 #ifdef LO_AVX512F_AVAILABLE // New processors
     // Make sure we don't fall out of bounds.
@@ -85,8 +92,8 @@ KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
         static_assert(sizeof(double) == 8);
         double sums[8];
         double errs[8];
-        _mm512_storeu_pd(static_cast<void*>(&sums[0]), sum);
-        _mm512_storeu_pd(static_cast<void*>(&errs[0]), err);
+        _mm512_storeu_pd(&sums[0], sum);
+        _mm512_storeu_pd(&errs[0], err);
 
         // First Kahan & pairwise summation
         // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
@@ -112,16 +119,14 @@ KahanSum executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
         sumNeumanierNormal(sums[0], errs[0], errs[4]);
 
         // Return final result
-        return KahanSum(sums[0], errs[0]);
+        return { sums[0], errs[0] };
     }
-    else
-        return 0.0;
+    return { 0.0, 0.0 };
 #else
-    SAL_WARN("sc", "Failed to use AVX 512");
     (void)i;
     (void)nSize;
     (void)pCurrent;
-    return 0.0;
+    abort();
 #endif
 }
 
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
index e2ab945acc4a..95b42f868461 100644
--- a/sc/source/core/tool/arraysumSSE2.cxx
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -8,23 +8,27 @@
  *
  */
 
-#include <arraysumfunctor.hxx>
-#include <sal/log.hxx>
+#define LO_ARRAYSUM_SPACE SSE2
+#include "arraysum.hxx"
+
+#include <arraysumfunctorinternal.hxx>
+
 #include <tools/simd.hxx>
 #include <tools/simdsupport.hxx>
 
-//AVX512VL + AVX512F + KNCNI
+#include <cstdlib>
 
 namespace sc::op
 {
 #ifdef LO_SSE2_AVAILABLE // Old processors
 
-const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+using namespace SSE2;
 
-/** Kahan sum with SSE4.2.
+/** Kahan sum with SSE2.
   */
 static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
 {
+    const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
     // Temporal parameter
     __m128d t = _mm_add_pd(sum, value);
     // Absolute value of the total sum
@@ -47,7 +51,7 @@ static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
 
 /** Execute Kahan sum with SSE2.
   */
-KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
+KahanSumSimple executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
 {
 #ifdef LO_SSE2_AVAILABLE
     // Make sure we don't fall out of bounds.
@@ -113,15 +117,14 @@ KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
         sumNeumanierNormal(sums[0], errs[0], errs[1]);
 
         // Store result
-        return KahanSum(sums[0], errs[0]);
+        return { sums[0], errs[0] };
     }
-    return 0.0;
+    return { 0.0, 0.0 };
 #else
-    SAL_WARN("sc", "Failed to use SSE2");
     (void)i;
     (void)nSize;
     (void)pCurrent;
-    return 0.0;
+    abort();
 #endif
 }
 }
author	Luboš Luňák <l.lunak@collabora.com>	2021-09-10 23:43:33 +0200
committer	Luboš Luňák <l.lunak@collabora.com>	2021-09-13 11:08:20 +0200
commit	26072b8db7ba53f00c83197cb064229a76001989 (patch)
tree	d144b6b8094f91a92456306cfc0b31d602fc3577 /sc
parent	f75bae7082322a2e79997d041ce5f8afd3910107 (diff)