Reduce worst case algorithmic complexity of TextBlock::coalesce

The old algorithm restarts the inner loop for the RHS word from the beginning on each match, i.e. the worst case complexity approaches O(N^3), while O(N^2) is obviously sufficient for a pairwise compare of all words. Fortunately, O(N^2) is hardly ever happening, as the inner N is limited by a) the maxBaseIdx, b) removing duplicates from the set. For some pathological cases this changes the runtime from minutes to seconds. See poppler#1173.
author: Stefan Brüns <stefan.bruens@rwth-aachen.de> 2024-03-28 02:37:09 +0100
committer: Stefan Brüns <stefan.bruens@rwth-aachen.de> 2024-03-30 16:21:15 +0100
commit: f26b292412a9266aab46deb2ce1ffc4d016cc573 (patch)
tree: 8e18a1a0294f529b481e23ac31fe58a9285f0503
parent: 835987362d9873cf98cc3f86959910ff2107a509 (diff)
1 files changed, 60 insertions, 49 deletions
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 03b68bc2..69a205a9 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -1611,33 +1611,33 @@ void TextBlock::addWord(TextWord *word)
 
 void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
 {
-    TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
-    TextLine *line, *line0, *line1;
-    int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
-    int baseIdx, bestWordBaseIdx, idx0, idx1;
-    double minBase, maxBase;
-    double fontSize, wordSpacing, delta, priDelta, secDelta;
-    TextLine **lineArray;
-    bool found, overlap;
-    int col1, col2;
-    int i, j, k;
-
     // discard duplicated text (fake boldface, drop shadows)
-    for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
-        word0 = pool->getPool(idx0);
+    for (int idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
+        // Get the first LHS word from the pool
+        TextWord *word0 = pool->getPool(idx0);
+
         while (word0) {
-            priDelta = dupMaxPriDelta * word0->fontSize;
-            secDelta = dupMaxSecDelta * word0->fontSize;
-            maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
-            found = false;
-            word1 = word2 = nullptr; // make gcc happy
-            for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
-                if (idx1 == idx0) {
-                    word1 = word0;
-                    word2 = word0->next;
+            double priDelta = dupMaxPriDelta * word0->fontSize;
+            double secDelta = dupMaxSecDelta * word0->fontSize;
+            double xDelta = ((rot == 0) || (rot == 2)) ? priDelta : secDelta;
+            double yDelta = ((rot == 0) || (rot == 2)) ? secDelta : priDelta;
+
+            int maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
+
+            for (int idx1 = idx0; idx1 <= maxBaseIdx; idx1++) {
+                TextWord *prevWord;
+                /* In case the RHS word is from the same pool as the LHS word,
+                 * start the inner loop with the word following the LHS word.
+                 * Otherwise, start with the second word from the subsequent pools
+                 * - the first word is compared at the end.
+                 */
+                if (idx0 == idx1) {
+                    prevWord = word0;
                 } else {
-                    word1 = nullptr;
-                    word2 = pool->getPool(idx1);
+                    prevWord = pool->getPool(idx1);
+                    if (!prevWord) {
+                        continue;
+                    }
                 }
                 TextWord *word1 = prevWord->next;
 
@@ -1645,40 +1645,51 @@ void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
                     return std::equal(w1.chars.begin(), w1.chars.end(), w2.chars.begin(), w2.chars.end(), //
                                       [](auto c1, auto c2) { return c1.text == c2.text; });
                 };
-                for (; word2; word1 = word2, word2 = word2->next) {
-                    if (equalText(*word0, *word2)) {
-                        switch (rot) {
-                        case 0:
-                        case 2:
-                            found = fabs(word0->xMin - word2->xMin) < priDelta && fabs(word0->xMax - word2->xMax) < priDelta && fabs(word0->yMin - word2->yMin) < secDelta && fabs(word0->yMax - word2->yMax) < secDelta;
-                            break;
-                        case 1:
-                        case 3:
-                            found = fabs(word0->xMin - word2->xMin) < secDelta && fabs(word0->xMax - word2->xMax) < secDelta && fabs(word0->yMin - word2->yMin) < priDelta && fabs(word0->yMax - word2->yMax) < priDelta;
-                            break;
-                        }
+                auto match = [&equalText, xDelta, yDelta](const TextWord &w1, const TextWord &w2) -> bool {
+                    if (!equalText(w1, w2)) {
+                        return false;
                     }
-                    if (found) {
-                        break;
+                    return fabs(w1.xMin - w2.xMin) < xDelta && fabs(w1.xMax - w2.xMax) < xDelta //
+                            && fabs(w1.yMin - w2.yMin) < yDelta && fabs(w1.yMax - w2.yMax) < yDelta;
+                };
+
+                while (word1) {
+                    if (match(*word0, *word1)) {
+                        prevWord->next = word1->next;
+                        delete word1;
+                        word1 = prevWord->next;
+                    } else {
+                        prevWord = word1;
+                        word1 = word1->next;
                     }
                 }
-                if (found) {
-                    break;
+
+                // Check the first word from each subsequent pool
+                if (idx0 != idx1) {
+                    word1 = pool->getPool(idx1);
                 }
-            }
-            if (found) {
-                if (word1) {
-                    word1->next = word2->next;
-                } else {
-                    pool->setPool(idx1, word2->next);
+                if (word1 && match(*word0, *word1)) {
+                    pool->setPool(idx1, word1->next);
+                    delete word1;
                 }
-                delete word2;
-            } else {
-                word0 = word0->next;
             }
+
+            word0 = word0->next;
         }
     }
 
+    TextWord *word0, *word1;
+    TextWord *bestWord0, *bestWord1, *lastWord;
+    TextLine *line, *line0, *line1;
+    TextLine **lineArray;
+    int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
+    int baseIdx, bestWordBaseIdx;
+    double minBase, maxBase;
+    double fontSize, wordSpacing, delta;
+    bool overlap;
+    int col1, col2;
+    int i, j, k;
+
     // build the lines
     curLine = nullptr;
     poolMinBaseIdx = pool->minBaseIdx;
author	Stefan Brüns <stefan.bruens@rwth-aachen.de>	2024-03-28 02:37:09 +0100
committer	Stefan Brüns <stefan.bruens@rwth-aachen.de>	2024-03-30 16:21:15 +0100
commit	f26b292412a9266aab46deb2ce1ffc4d016cc573 (patch)
tree	8e18a1a0294f529b481e23ac31fe58a9285f0503
parent	835987362d9873cf98cc3f86959910ff2107a509 (diff)