diff options
author | Albert Astals Cid <aacid@kde.org> | 2012-02-06 00:25:53 +0100 |
---|---|---|
committer | Albert Astals Cid <aacid@kde.org> | 2012-02-06 00:25:53 +0100 |
commit | e17f09563276ee25b6acfc127b6ea360da650030 (patch) | |
tree | 897d75a0dff44559515d940b1a59dacc3bccdb9e | |
parent | 548648bf29dc1551443eb1925814342e7aadee46 (diff) |
[xpdf303] TextOutputDev and associated changes
-rw-r--r-- | cpp/poppler-page.cpp | 10 | ||||
-rw-r--r-- | glib/poppler-page.cc | 5 | ||||
-rw-r--r-- | poppler/ArthurOutputDev.cc | 1 | ||||
-rw-r--r-- | poppler/CairoOutputDev.cc | 2 | ||||
-rw-r--r-- | poppler/Gfx.cc | 6 | ||||
-rw-r--r-- | poppler/PSOutputDev.cc | 6 | ||||
-rw-r--r-- | poppler/TextOutputDev.cc | 360 | ||||
-rw-r--r-- | poppler/TextOutputDev.h | 18 | ||||
-rw-r--r-- | qt4/src/poppler-page.cc | 12 | ||||
-rw-r--r-- | test/perf-test.cc | 2 | ||||
-rw-r--r-- | utils/pdftotext.1 | 4 | ||||
-rw-r--r-- | utils/pdftotext.cc | 10 |
12 files changed, 270 insertions, 166 deletions
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 4e2f730b..d72a4776 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -208,24 +208,24 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction double rect_right = r.right(); double rect_bottom = r.bottom(); - TextOutputDev td(NULL, gTrue, gFalse, gFalse); + TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse); d->doc->doc->displayPage(&td, d->index + 1, 72, 72, rotation_value, false, true, false); TextPage *text_page = td.takeText(); switch (direction) { case search_from_top: found = text_page->findText(&u[0], len, - gTrue, gTrue, gFalse, gFalse, sCase, gFalse, + gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse, &rect_left, &rect_top, &rect_right, &rect_bottom); break; case search_next_result: found = text_page->findText(&u[0], len, - gFalse, gTrue, gTrue, gFalse, sCase, gFalse, + gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse, &rect_left, &rect_top, &rect_right, &rect_bottom); break; case search_previous_result: found = text_page->findText(&u[0], len, - gFalse, gTrue, gTrue, gFalse, sCase, gTrue, + gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse, &rect_left, &rect_top, &rect_right, &rect_bottom); break; } @@ -267,7 +267,7 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const { std::auto_ptr<GooString> s; const GBool use_raw_order = (layout_mode == raw_order_layout); - TextOutputDev td(0, gFalse, use_raw_order, gFalse); + TextOutputDev td(0, gFalse, 0, use_raw_order, gFalse); d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false); if (r.is_empty()) { const PDFRectangle *rect = d->page->getCropBox(); diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index 7b98625f..a95ff6ab 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -263,7 +263,7 @@ poppler_page_get_text_page (PopplerPage *page) TextOutputDev *text_dev; Gfx *gfx; - text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse); + text_dev = new TextOutputDev (NULL, gTrue, 0, gFalse, gFalse); gfx = page->page->createGfx(text_dev, 72.0, 72.0, 0, gFalse, /* useMediaBox */ @@ -888,6 +888,7 @@ poppler_page_find_text (PopplerPage *page, gFalse, gTrue, // startAtTop, stopAtBottom gFalse, gFalse, // startAtLast, stopAtLast gFalse, gFalse, // caseSensitive, backwards + gFalse, // wholeWord &xMin, &yMin, &xMax, &yMax)) { match = poppler_rectangle_new (); @@ -1064,7 +1065,7 @@ poppler_page_render_to_ps (PopplerPage *page, ps_file->first_page, ps_file->last_page, psModePS, (int)ps_file->paper_width, (int)ps_file->paper_height, ps_file->duplex, - 0, 0, 0, 0, gFalse, gFalse); + 0, 0, 0, 0, gFalse); ps_file->document->doc->displayPage (ps_file->out, page->index + 1, 72.0, 72.0, diff --git a/poppler/ArthurOutputDev.cc b/poppler/ArthurOutputDev.cc index 92bc84f5..301232bf 100644 --- a/poppler/ArthurOutputDev.cc +++ b/poppler/ArthurOutputDev.cc @@ -292,6 +292,7 @@ void ArthurOutputDev::updateFont(GfxState *state) m_font = NULL; fileName = NULL; tmpBuf = NULL; + fontLoc = NULL; if (!(gfxFont = state->getFont())) { goto err1; diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc index dc5698e7..340b3a3b 100644 --- a/poppler/CairoOutputDev.cc +++ b/poppler/CairoOutputDev.cc @@ -246,7 +246,7 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state) { void CairoOutputDev::endPage() { if (text) { text->endPage(); - text->coalesce(gTrue, gFalse); + text->coalesce(gTrue, 0, gFalse); } } diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc index 4fa94313..46e9c4a6 100644 --- a/poppler/Gfx.cc +++ b/poppler/Gfx.cc @@ -2866,7 +2866,7 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) { GfxColor colorA, colorB; double xa, ya, xb, yb, ra, rb; double ta, tb, sa, sb; - double sz, sMin, sMax, h; + double sMin, sMax, h; double sLeft, sRight, sTop, sBottom, sZero, sDiag; GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero; GBool haveSMin, haveSMax; @@ -2888,18 +2888,14 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) { if (h == 0) { enclosed = gTrue; theta = 0; // make gcc happy - sz = 0; // make gcc happy } else if (r1 - r0 == 0) { enclosed = gFalse; theta = 0; - sz = 0; // make gcc happy } else if (fabs(r1 - r0) >= h) { enclosed = gTrue; theta = 0; // make gcc happy - sz = 0; // make gcc happy } else { enclosed = gFalse; - sz = -r0 / (r1 - r0); theta = asin((r1 - r0) / h); } diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc index 7dbac8a8..d91cae9a 100644 --- a/poppler/PSOutputDev.cc +++ b/poppler/PSOutputDev.cc @@ -4178,7 +4178,7 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading, double xMin, yMin, xMax, yMax; double x0, y0, r0, x1, y1, r1, t0, t1; double xa, ya, ra; - double sz, sMin, sMax, h, ta; + double sMin, sMax, h, ta; double sLeft, sRight, sTop, sBottom, sZero, sDiag; GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero; GBool haveSMin, haveSMax; @@ -4206,18 +4206,14 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading, if (h == 0) { enclosed = gTrue; theta = 0; // make gcc happy - sz = 0; // make gcc happy } else if (r1 - r0 == 0) { enclosed = gFalse; theta = 0; - sz = 0; // make gcc happy } else if (fabs(r1 - r0) >= h) { enclosed = gTrue; theta = 0; // make gcc happy - sz = 0; // make gcc happy } else { enclosed = gFalse; - sz = -r0 / (r1 - r0); theta = asin((r1 - r0) / h); } if (enclosed) { diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 7a0b8d99..531617df 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -663,7 +663,7 @@ void TextPool::addWord(TextWord *word) { // insert the new word if (cursor && wordBaseIdx == cursorBaseIdx && - word->primaryCmp(cursor) > 0) { + word->primaryCmp(cursor) >= 0) { w0 = cursor; w1 = cursor->next; } else { @@ -1053,7 +1053,7 @@ void TextLineFrag::computeCoords(GBool oneRot) { xMax = blk->xMin + d1 * (blk->xMax - blk->xMin); yMin = blk->yMin + d2 * (blk->yMax - blk->yMin); yMax = blk->yMin + d3 * (blk->yMax - blk->yMin); - base = blk->yMin + base * (blk->yMax - blk->yMin); + base = blk->yMin + d4 * (blk->yMax - blk->yMin); break; case 1: xMin = blk->xMax - d3 * (blk->xMax - blk->xMin); @@ -1277,15 +1277,15 @@ void TextBlock::addWord(TextWord *word) { } } -void TextBlock::coalesce(UnicodeMap *uMap) { +void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord; TextLine *line, *line0, *line1; int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx; int baseIdx, bestWordBaseIdx, idx0, idx1; double minBase, maxBase; - double fontSize, delta, priDelta, secDelta; + double fontSize, wordSpacing, delta, priDelta, secDelta; TextLine **lineArray; - GBool found; + GBool found, overlap; int col1, col2; int i, j, k; @@ -1295,11 +1295,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) { while (word0) { priDelta = dupMaxPriDelta * word0->fontSize; secDelta = dupMaxSecDelta * word0->fontSize; - if (rot == 0 || rot == 3) { - maxBaseIdx = pool->getBaseIdx(word0->base + secDelta); - } else { - maxBaseIdx = pool->getBaseIdx(word0->base - secDelta); - } + maxBaseIdx = pool->getBaseIdx(word0->base + secDelta); found = gFalse; word1 = word2 = NULL; // make gcc happy for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) { @@ -1396,6 +1392,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) { maxBase = word0->base + maxIntraLineDelta * fontSize; minBaseIdx = pool->getBaseIdx(minBase); maxBaseIdx = pool->getBaseIdx(maxBase); + wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize; // find the rest of the words in this line while (1) { @@ -1404,25 +1401,32 @@ void TextBlock::coalesce(UnicodeMap *uMap) { // this line bestWordBaseIdx = 0; bestWord0 = bestWord1 = NULL; - for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + overlap = gFalse; + for (baseIdx = minBaseIdx; + !overlap && baseIdx <= maxBaseIdx; + ++baseIdx) { for (word0 = NULL, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) { if (word1->base >= minBase && - word1->base <= maxBase && - (delta = lastWord->primaryDelta(word1)) >= - minCharSpacing * fontSize) { - if (delta < maxWordSpacing * fontSize && - (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) { - bestWordBaseIdx = baseIdx; - bestWord0 = word0; - bestWord1 = word1; + word1->base <= maxBase) { + delta = lastWord->primaryDelta(word1); + if (delta < minCharSpacing * fontSize) { + overlap = gTrue; + break; + } else { + if (delta < wordSpacing && + (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) { + bestWordBaseIdx = baseIdx; + bestWord0 = word0; + bestWord1 = word1; + } + break; } - break; } } } - if (!bestWord1) { + if (overlap || !bestWord1) { break; } @@ -1469,52 +1473,79 @@ void TextBlock::coalesce(UnicodeMap *uMap) { // column assignment nColumns = 0; - for (i = 0; i < nLines; ++i) { - line0 = lineArray[i]; - col1 = 0; - for (j = 0; j < i; ++j) { - line1 = lineArray[j]; - if (line1->primaryDelta(line0) >= 0) { - col2 = line1->col[line1->len] + 1; - } else { - k = 0; // make gcc happy - switch (rot) { - case 0: - for (k = 0; - k < line1->len && - line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); - ++k) ; - break; - case 1: - for (k = 0; - k < line1->len && - line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); - ++k) ; - break; - case 2: - for (k = 0; - k < line1->len && - line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); - ++k) ; - break; - case 3: - for (k = 0; - k < line1->len && - line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); - ++k) ; - break; - } - col2 = line1->col[k]; + if (fixedPitch) { + for (i = 0; i < nLines; ++i) { + line0 = lineArray[i]; + col1 = 0; // make gcc happy + switch (rot) { + case 0: + col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5); + break; + case 1: + col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5); + break; + case 2: + col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5); + break; + case 3: + col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5); + break; } - if (col2 > col1) { - col1 = col2; + for (k = 0; k <= line0->len; ++k) { + line0->col[k] += col1; + } + if (line0->col[line0->len] > nColumns) { + nColumns = line0->col[line0->len]; } } - for (k = 0; k <= line0->len; ++k) { - line0->col[k] += col1; - } - if (line0->col[line0->len] > nColumns) { - nColumns = line0->col[line0->len]; + } else { + for (i = 0; i < nLines; ++i) { + line0 = lineArray[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + line1 = lineArray[j]; + if (line1->primaryDelta(line0) >= 0) { + col2 = line1->col[line1->len] + 1; + } else { + k = 0; // make gcc happy + switch (rot) { + case 0: + for (k = 0; + k < line1->len && + line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 1: + for (k = 0; + k < line1->len && + line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 2: + for (k = 0; + k < line1->len && + line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 3: + for (k = 0; + k < line1->len && + line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + } + col2 = line1->col[k]; + } + if (col2 > col1) { + col1 = col2; + } + } + for (k = 0; k <= line0->len; ++k) { + line0->col[k] += col1; + } + if (line0->col[line0->len] > nColumns) { + nColumns = line0->col[line0->len]; + } } } gfree(lineArray); @@ -2111,6 +2142,8 @@ void TextPage::clear() { gfree(blocks); } deleteGooList(fonts, TextFontInfo); + deleteGooList(underlines, TextUnderline); + deleteGooList(links, TextLink); curWord = NULL; charPos = 0; @@ -2128,6 +2161,8 @@ void TextPage::clear() { rawWords = NULL; rawLastWord = NULL; fonts = new GooList(); + underlines = new GooList(); + links = new GooList(); } void TextPage::updateFont(GfxState *state) { @@ -2426,7 +2461,7 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link) links->append(new TextLink(xMin, yMin, xMax, yMax, link)); } -void TextPage::coalesce(GBool physLayout, GBool doHTML) { +void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { UnicodeMap *uMap; TextPool *pool; TextWord *word0, *word1, *word2; @@ -2454,7 +2489,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { blkList = NULL; lastBlk = NULL; nBlocks = 0; - primaryRot = -1; + primaryRot = 0; #if 0 // for debugging printf("*** initial words ***\n"); @@ -2918,7 +2953,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { //~ addition to primary rotation // coalesce the block, and add it to the list - blk->coalesce(uMap); + blk->coalesce(uMap, fixedPitch); if (lastBlk) { lastBlk->next = blk; } else { @@ -2926,11 +2961,12 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { } lastBlk = blk; count[rot] += blk->charCount; - if (primaryRot < 0 || count[rot] > count[primaryRot]) { - primaryRot = rot; - } ++nBlocks; } + + if (count[rot] > count[primaryRot]) { + primaryRot = rot; + } } #if 0 // for debugging @@ -2992,75 +3028,108 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { // sort blocks into xy order for column assignment if (blocks) gfree (blocks); - blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *)); - for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { - blocks[i] = blk; - } - qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); + if (physLayout && fixedPitch) { - // column assignment - for (i = 0; i < nBlocks; ++i) { - blk0 = blocks[i]; - col1 = 0; - for (j = 0; j < i; ++j) { - blk1 = blocks[j]; - col2 = 0; // make gcc happy + blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *)); + for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { + blocks[i] = blk; + col1 = 0; // make gcc happy switch (primaryRot) { case 0: - if (blk0->xMin > blk1->xMax) { - col2 = blk1->col + blk1->nColumns + 3; - } else if (blk1->xMax == blk1->xMin) { - col2 = blk1->col; - } else { - col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / - (blk1->xMax - blk1->xMin)) * - blk1->nColumns); - } + col1 = (int)(blk->xMin / fixedPitch + 0.5); break; case 1: - if (blk0->yMin > blk1->yMax) { - col2 = blk1->col + blk1->nColumns + 3; - } else if (blk1->yMax == blk1->yMin) { - col2 = blk1->col; - } else { - col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / - (blk1->yMax - blk1->yMin)) * - blk1->nColumns); - } + col1 = (int)(blk->yMin / fixedPitch + 0.5); break; case 2: - if (blk0->xMax < blk1->xMin) { - col2 = blk1->col + blk1->nColumns + 3; - } else if (blk1->xMin == blk1->xMax) { - col2 = blk1->col; - } else { - col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / - (blk1->xMin - blk1->xMax)) * - blk1->nColumns); - } + col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5); break; case 3: - if (blk0->yMax < blk1->yMin) { - col2 = blk1->col + blk1->nColumns + 3; - } else if (blk1->yMin == blk1->yMax) { - col2 = blk1->col; - } else { - col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / - (blk1->yMin - blk1->yMax)) * - blk1->nColumns); - } + col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5); break; } - if (col2 > col1) { - col1 = col2; + blk->col = col1; + for (line = blk->lines; line; line = line->next) { + for (j = 0; j <= line->len; ++j) { + line->col[j] += col1; + } } } - blk0->col = col1; - for (line = blk0->lines; line; line = line->next) { - for (j = 0; j <= line->len; ++j) { - line->col[j] += col1; + + } else { + + // sort blocks into xy order for column assignment + blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *)); + for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { + blocks[i] = blk; + } + qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); + + // column assignment + for (i = 0; i < nBlocks; ++i) { + blk0 = blocks[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + blk1 = blocks[j]; + col2 = 0; // make gcc happy + switch (primaryRot) { + case 0: + if (blk0->xMin > blk1->xMax) { + col2 = blk1->col + blk1->nColumns + 3; + } else if (blk1->xMax == blk1->xMin) { + col2 = blk1->col; + } else { + col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / + (blk1->xMax - blk1->xMin)) * + blk1->nColumns); + } + break; + case 1: + if (blk0->yMin > blk1->yMax) { + col2 = blk1->col + blk1->nColumns + 3; + } else if (blk1->yMax == blk1->yMin) { + col2 = blk1->col; + } else { + col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / + (blk1->yMax - blk1->yMin)) * + blk1->nColumns); + } + break; + case 2: + if (blk0->xMax < blk1->xMin) { + col2 = blk1->col + blk1->nColumns + 3; + } else if (blk1->xMin == blk1->xMax) { + col2 = blk1->col; + } else { + col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / + (blk1->xMin - blk1->xMax)) * + blk1->nColumns); + } + break; + case 3: + if (blk0->yMax < blk1->yMin) { + col2 = blk1->col + blk1->nColumns + 3; + } else if (blk1->yMin == blk1->yMax) { + col2 = blk1->col; + } else { + col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / + (blk1->yMin - blk1->yMax)) * + blk1->nColumns); + } + break; + } + if (col2 > col1) { + col1 = col2; + } + } + blk0->col = col1; + for (line = blk0->lines; line; line = line->next) { + for (j = 0; j <= line->len; ++j) { + line->col[j] += col1; + } } } + } #if 0 // for debugging @@ -3070,7 +3139,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col, blk->nColumns); for (line = blk->lines; line; line = line->next) { - printf(" line:\n"); + printf(" line: col[0]=%d\n", line->col[0]); for (word0 = line->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, @@ -3470,6 +3539,7 @@ GBool TextPage::findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, GBool caseSensitive, GBool backward, + GBool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) { TextBlock *blk; @@ -3527,25 +3597,35 @@ GBool TextPage::findText(Unicode *s, int len, blk = blocks[i]; // check: is the block above the top limit? - if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) { + // (this only works if the page's primary rotation is zero -- + // otherwise the blocks won't be sorted in the useful order) + if (!startAtTop && primaryRot == 0 && + (backward ? blk->yMin > yStart : blk->yMax < yStart)) { continue; } // check: is the block below the bottom limit? - if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) { + // (this only works if the page's primary rotation is zero -- + // otherwise the blocks won't be sorted in the useful order) + if (!stopAtBottom && primaryRot == 0 && + (backward ? blk->yMax < yStop : blk->yMin > yStop)) { break; } for (line = blk->lines; line; line = line->next) { // check: is the line above the top limit? - if (!startAtTop && + // (this only works if the page's primary rotation is zero -- + // otherwise the lines won't be sorted in the useful order) + if (!startAtTop && primaryRot == 0 && (backward ? line->yMin > yStart : line->yMin < yStart)) { continue; } // check: is the line below the bottom limit? - if (!stopAtBottom && + // (this only works if the page's primary rotation is zero -- + // otherwise the lines won't be sorted in the useful order) + if (!stopAtBottom && primaryRot == 0 && (backward ? line->yMin < yStop : line->yMin > yStop)) { continue; } @@ -3564,9 +3644,9 @@ GBool TextPage::findText(Unicode *s, int len, for (k = 0; k < m; ++k) { txt[k] = unicodeToUpper(line->normalized[k]); } - } else { + } else { txt = line->normalized; - } + } // search each position in this line j = backward ? m - len : 0; @@ -5211,9 +5291,11 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len) } TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, - GBool rawOrderA, GBool append) { + double fixedPitchA, GBool rawOrderA, + GBool append) { text = NULL; physLayout = physLayoutA; + fixedPitch = physLayout ? fixedPitchA : 0; rawOrder = rawOrderA; doHTML = gFalse; ok = gTrue; @@ -5246,11 +5328,13 @@ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, } TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, - GBool physLayoutA, GBool rawOrderA) { + GBool physLayoutA, double fixedPitchA, + GBool rawOrderA) { outputFunc = func; outputStream = stream; needClose = gFalse; physLayout = physLayoutA; + fixedPitch = physLayout ? fixedPitchA : 0; rawOrder = rawOrderA; doHTML = gFalse; text = new TextPage(rawOrderA); @@ -5277,12 +5361,16 @@ void TextOutputDev::startPage(int pageNum, GfxState *state) { void TextOutputDev::endPage() { text->endPage(); - text->coalesce(physLayout, doHTML); + text->coalesce(physLayout, fixedPitch, doHTML); if (outputStream) { text->dump(outputStream, outputFunc, physLayout); } } +void TextOutputDev::restoreState(GfxState *state) { + text->updateFont(state); +} + void TextOutputDev::updateFont(GfxState *state) { text->updateFont(state); } @@ -5465,10 +5553,12 @@ GBool TextOutputDev::findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, GBool caseSensitive, GBool backward, + GBool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) { return text->findText(s, len, startAtTop, stopAtBottom, - startAtLast, stopAtLast, caseSensitive, backward, + startAtLast, stopAtLast, + caseSensitive, backward, wholeWord, xMin, yMin, xMax, yMax); } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index fd34c8bb..e31876b9 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -333,7 +333,7 @@ public: void addWord(TextWord *word); - void coalesce(UnicodeMap *uMap); + void coalesce(UnicodeMap *uMap, double fixedPitch); // Update this block's priMin and priMax values, looking at <blk>. void updatePriMinMax(TextBlock *blk); @@ -521,7 +521,7 @@ public: void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link); // Coalesce strings that look like parts of the same line. - void coalesce(GBool physLayout, GBool doHTML); + void coalesce(GBool physLayout, double fixedPitch, GBool doHTML); // Find a string. If <startAtTop> is true, starts looking at the // top of the page; else if <startAtLast> is true, starts looking @@ -534,6 +534,7 @@ public: GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, GBool caseSensitive, GBool backward, + GBool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax); @@ -676,14 +677,16 @@ public: // is maintained. If <rawOrder> is true, the text is kept in // content stream order. TextOutputDev(char *fileName, GBool physLayoutA, - GBool rawOrderA, GBool append); + double fixedPitchA, GBool rawOrderA, + GBool append); // Create a TextOutputDev which will write to a generic stream. If // <physLayoutA> is true, the original physical layout of the text // is maintained. If <rawOrder> is true, the text is kept in // content stream order. TextOutputDev(TextOutputFunc func, void *stream, - GBool physLayoutA, GBool rawOrderA); + GBool physLayoutA, double fixedPitchA, + GBool rawOrderA); // Destructor. virtual ~TextOutputDev(); @@ -719,6 +722,9 @@ public: // End a page. virtual void endPage(); + //----- save/restore graphics state + virtual void restoreState(GfxState *state); + //----- update text state virtual void updateFont(GfxState *state); @@ -754,6 +760,7 @@ public: GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, GBool caseSensitive, GBool backward, + GBool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax); @@ -804,6 +811,9 @@ private: TextPage *text; // text for the current page GBool physLayout; // maintain original physical layout when // dumping text + double fixedPitch; // if physLayout is true and this is non-zero, + // assume fixed-pitch characters with this + // width GBool rawOrder; // keep text in content stream order GBool doHTML; // extra processing for HTML conversion GBool ok; // set up ok? diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc index 9dc1d15b..398a69b8 100644 --- a/qt4/src/poppler-page.cc +++ b/qt4/src/poppler-page.cc @@ -330,7 +330,7 @@ QString Page::text(const QRectF &r, TextLayout textLayout) const QString result; const GBool rawOrder = textLayout == RawOrderLayout; - output_dev = new TextOutputDev(0, gFalse, rawOrder, gFalse); + output_dev = new TextOutputDev(0, gFalse, 0, rawOrder, gFalse); m_page->parentDoc->doc->displayPageSlice(output_dev, m_page->index + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1); if (r.isNull()) @@ -371,19 +371,19 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig int rotation = (int)rotate * 90; // fetch ourselves a textpage - TextOutputDev td(NULL, gTrue, gFalse, gFalse); + TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse); m_page->parentDoc->doc->displayPage( &td, m_page->index + 1, 72, 72, rotation, false, true, false ); TextPage *textPage=td.takeText(); if (direction == FromTop) found = textPage->findText( u.data(), len, - gTrue, gTrue, gFalse, gFalse, sCase, gFalse, &sLeft, &sTop, &sRight, &sBottom ); + gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse, &sLeft, &sTop, &sRight, &sBottom ); else if ( direction == NextResult ) found = textPage->findText( u.data(), len, - gFalse, gTrue, gTrue, gFalse, sCase, gFalse, &sLeft, &sTop, &sRight, &sBottom ); + gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse, &sLeft, &sTop, &sRight, &sBottom ); else if ( direction == PreviousResult ) found = textPage->findText( u.data(), len, - gFalse, gTrue, gTrue, gFalse, sCase, gTrue, &sLeft, &sTop, &sRight, &sBottom ); + gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse, &sLeft, &sTop, &sRight, &sBottom ); textPage->decRefCnt(); @@ -414,7 +414,7 @@ QList<TextBox*> Page::textList(Rotation rotate) const QList<TextBox*> output_list; - output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse); + output_dev = new TextOutputDev(0, gFalse, 0, gFalse, gFalse); int rotation = (int)rotate * 90; diff --git a/test/perf-test.cc b/test/perf-test.cc index b6fb8f89..6bdda97d 100644 --- a/test/perf-test.cc +++ b/test/perf-test.cc @@ -840,7 +840,7 @@ static void RenderPdfAsText(const char *fileName) LogInfo("started: %s\n", fileName);
- TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, gFalse, gFalse);
+ TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, 0, gFalse, gFalse);
if (!textOut->isOk()) {
delete textOut;
return;
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1 index 587f1a9c..88fbf702 100644 --- a/utils/pdftotext.1 +++ b/utils/pdftotext.1 @@ -53,6 +53,10 @@ Maintain (as best as possible) the original physical layout of the text. The default is to \'undo' physical layout (columns, hyphenation, etc.) and output the text in reading order. .TP +.BI \-fixed " number" +Assume fixed-pitch (or tabular) text, with the specified character +width (in points). This forces physical layout mode. +.TP .B \-raw Keep the text in content stream order. This is a hack which often "undoes" column formatting, etc. Use of raw mode is no longer diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index 2e7b32e5..a170f1b7 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -68,6 +68,7 @@ static int w = 0; static int h = 0; static GBool bbox = gFalse; static GBool physLayout = gFalse; +static double fixedPitch = 0; static GBool rawOrder = gFalse; static GBool htmlMeta = gFalse; static char textEncName[128] = ""; @@ -97,6 +98,8 @@ static const ArgDesc argDesc[] = { "height of crop area in pixels (default is 0)"}, {"-layout", argFlag, &physLayout, 0, "maintain original physical layout"}, + {"-fixed", argFP, &fixedPitch, 0, + "assume fixed-pitch (or tabular) text"}, {"-raw", argFlag, &rawOrder, 0, "keep strings in content stream order"}, {"-htmlmeta", argFlag, &htmlMeta, 0, @@ -197,6 +200,9 @@ int main(int argc, char *argv[]) { } fileName = new GooString(argv[1]); + if (fixedPitch) { + physLayout = gTrue; + } if (textEncName[0]) { globalParams->setTextEncoding(textEncName); @@ -333,7 +339,7 @@ int main(int argc, char *argv[]) { // write text file if (bbox) { - textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta); + textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta); if (!(f = fopen(textFileName->getCString(), "ab"))) { error(errIO, -1, "Couldn't open text file '{0:t}' for append", textFileName); exitCode = 2; @@ -367,7 +373,7 @@ int main(int argc, char *argv[]) { fclose(f); } else { textOut = new TextOutputDev(textFileName->getCString(), - physLayout, rawOrder, htmlMeta); + physLayout, fixedPitch, rawOrder, htmlMeta); if (textOut->isOk()) { if ((w==0) && (h==0) && (x==0) && (y==0)) { doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0, |