summaryrefslogtreecommitdiff
path: root/sc/source/core/inc/arraysumfunctor.hxx
blob: 3955fd9451f69b6f43cfc161b883ca001350aa4b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 */

#ifndef INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
#define INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX

#include <cstdint>
#include <rtl/math.hxx>
#include <tools/cpuid.hxx>

#if defined(LO_SSE2_AVAILABLE)
#include <emmintrin.h>
#endif

namespace sc
{

template<typename T, unsigned int N>
inline bool isAligned(const T* pointer)
{
    return 0 == (uintptr_t(pointer) % N);
}

struct ArraySumFunctor
{
private:
    const double* mpArray;
    size_t mnSize;

public:
    ArraySumFunctor(const double* pArray, size_t nSize)
        : mpArray(pArray)
        , mnSize(nSize)
    {
    }

    double operator() ()
    {
        static bool hasSSE2 = tools::cpuid::hasSSE2();

        double fSum = 0.0;
        size_t i = 0;
        const double* pCurrent = mpArray;

        if (hasSSE2)
        {
            while (!isAligned<double, 16>(pCurrent))
            {
                fSum += *pCurrent++;
                i++;
            }
            fSum += executeSSE2(i, pCurrent);
        }
        else
            fSum += executeUnrolled(i, pCurrent);

        // sum rest of the array

        for (; i < mnSize; ++i)
            fSum += mpArray[i];

        // If the sum is a NaN, some of the terms were empty cells, probably.
        // Re-calculate, carefully
        if (!rtl::math::isFinite(fSum))
        {
            sal_uInt32 nErr = reinterpret_cast< sal_math_Double * >(&fSum)->nan_parts.fraction_lo;
            if (nErr & 0xffff0000)
            {
                fSum = 0;
                for (i = 0; i < mnSize; i++)
                {
                    if (!rtl::math::isFinite(mpArray[i]))
                    {
                        nErr = reinterpret_cast< const sal_math_Double * >(&mpArray[i])->nan_parts.fraction_lo;
                        if (!(nErr & 0xffff0000))
                            fSum += mpArray[i]; // Let errors encoded as NaNs propagate ???
                    }
                    else
                        fSum += mpArray[i];
                }
            }
        }
        return fSum;
    }

private:
    inline double executeSSE2(size_t& i, const double* pCurrent) const
    {
#if defined(LO_SSE2_AVAILABLE)
        double fSum = 0.0;
        size_t nRealSize = mnSize - i;
        size_t nUnrolledSize = nRealSize - (nRealSize % 8);

        if (nUnrolledSize > 0)
        {
            __m128d sum1 = _mm_setzero_pd();
            __m128d sum2 = _mm_setzero_pd();
            __m128d sum3 = _mm_setzero_pd();
            __m128d sum4 = _mm_setzero_pd();

            for (; i < nUnrolledSize; i += 8)
            {
                __m128d load1 = _mm_load_pd(pCurrent);
                sum1 = _mm_add_pd(sum1, load1);
                pCurrent += 2;

                __m128d load2 = _mm_load_pd(pCurrent);
                sum2 = _mm_add_pd(sum2, load2);
                pCurrent += 2;

                __m128d load3 = _mm_load_pd(pCurrent);
                sum3 = _mm_add_pd(sum3, load3);
                pCurrent += 2;

                __m128d load4 = _mm_load_pd(pCurrent);
                sum4 = _mm_add_pd(sum4, load4);
                pCurrent += 2;
            }
            sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));

            double temp;

            _mm_storel_pd(&temp, sum1);
            fSum += temp;

            _mm_storeh_pd(&temp, sum1);
            fSum += temp;
        }
        return fSum;
#else
        (void) i;
        (void) pCurrent;
        return 0.0;
#endif
    }

    inline double executeUnrolled(size_t& i, const double* pCurrent) const
    {
        size_t nRealSize = mnSize - i;
        size_t nUnrolledSize = nRealSize - (nRealSize % 4);

        if (nUnrolledSize > 0)
        {
            double sum0 = 0.0;
            double sum1 = 0.0;
            double sum2 = 0.0;
            double sum3 = 0.0;

            for (; i < nUnrolledSize; i += 4)
            {
                sum0 += *pCurrent++;
                sum1 += *pCurrent++;
                sum2 += *pCurrent++;
                sum3 += *pCurrent++;
            }
            return sum0 + sum1 + sum2 + sum3;
        }
        return 0.0;
    }
};

} // end namespace sc

#endif

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */