summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Albert Astals Cid <aacid@kde.org> 2012-02-06 00:25:53 +0100
committer Albert Astals Cid <aacid@kde.org> 2012-02-06 00:25:53 +0100
commit e17f09563276ee25b6acfc127b6ea360da650030 (patch)
tree 897d75a0dff44559515d940b1a59dacc3bccdb9e
parent 548648bf29dc1551443eb1925814342e7aadee46 (diff)
[xpdf303] TextOutputDev and associated changes
-rw-r--r-- cpp/poppler-page.cpp 10
-rw-r--r-- glib/poppler-page.cc 5
-rw-r--r-- poppler/ArthurOutputDev.cc 1
-rw-r--r-- poppler/CairoOutputDev.cc 2
-rw-r--r-- poppler/Gfx.cc 6
-rw-r--r-- poppler/PSOutputDev.cc 6
-rw-r--r-- poppler/TextOutputDev.cc 360
-rw-r--r-- poppler/TextOutputDev.h 18
-rw-r--r-- qt4/src/poppler-page.cc 12
-rw-r--r-- test/perf-test.cc 2
-rw-r--r-- utils/pdftotext.1 4
-rw-r--r-- utils/pdftotext.cc 10
12 files changed, 270 insertions, 166 deletions
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 4e2f730b..d72a4776 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -208,24 +208,24 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
double rect_right = r.right();
double rect_bottom = r.bottom();
- TextOutputDev td(NULL, gTrue, gFalse, gFalse);
+ TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse);
d->doc->doc->displayPage(&td, d->index + 1, 72, 72, rotation_value, false, true, false);
TextPage *text_page = td.takeText();
switch (direction) {
case search_from_top:
found = text_page->findText(&u[0], len,
- gTrue, gTrue, gFalse, gFalse, sCase, gFalse,
+ gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse,
&rect_left, &rect_top, &rect_right, &rect_bottom);
break;
case search_next_result:
found = text_page->findText(&u[0], len,
- gFalse, gTrue, gTrue, gFalse, sCase, gFalse,
+ gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse,
&rect_left, &rect_top, &rect_right, &rect_bottom);
break;
case search_previous_result:
found = text_page->findText(&u[0], len,
- gFalse, gTrue, gTrue, gFalse, sCase, gTrue,
+ gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse,
&rect_left, &rect_top, &rect_right, &rect_bottom);
break;
}
@@ -267,7 +267,7 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
{
std::auto_ptr<GooString> s;
const GBool use_raw_order = (layout_mode == raw_order_layout);
- TextOutputDev td(0, gFalse, use_raw_order, gFalse);
+ TextOutputDev td(0, gFalse, 0, use_raw_order, gFalse);
d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false);
if (r.is_empty()) {
const PDFRectangle *rect = d->page->getCropBox();
diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 7b98625f..a95ff6ab 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -263,7 +263,7 @@ poppler_page_get_text_page (PopplerPage *page)
TextOutputDev *text_dev;
Gfx *gfx;
- text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+ text_dev = new TextOutputDev (NULL, gTrue, 0, gFalse, gFalse);
gfx = page->page->createGfx(text_dev,
72.0, 72.0, 0,
gFalse, /* useMediaBox */
@@ -888,6 +888,7 @@ poppler_page_find_text (PopplerPage *page,
gFalse, gTrue, // startAtTop, stopAtBottom
gFalse, gFalse, // startAtLast, stopAtLast
gFalse, gFalse, // caseSensitive, backwards
+ gFalse, // wholeWord
&xMin, &yMin, &xMax, &yMax))
{
match = poppler_rectangle_new ();
@@ -1064,7 +1065,7 @@ poppler_page_render_to_ps (PopplerPage *page,
ps_file->first_page, ps_file->last_page,
psModePS, (int)ps_file->paper_width,
(int)ps_file->paper_height, ps_file->duplex,
- 0, 0, 0, 0, gFalse, gFalse);
+ 0, 0, 0, 0, gFalse);
ps_file->document->doc->displayPage (ps_file->out, page->index + 1, 72.0, 72.0,
diff --git a/poppler/ArthurOutputDev.cc b/poppler/ArthurOutputDev.cc
index 92bc84f5..301232bf 100644
--- a/poppler/ArthurOutputDev.cc
+++ b/poppler/ArthurOutputDev.cc
@@ -292,6 +292,7 @@ void ArthurOutputDev::updateFont(GfxState *state)
m_font = NULL;
fileName = NULL;
tmpBuf = NULL;
+ fontLoc = NULL;
if (!(gfxFont = state->getFont())) {
goto err1;
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index dc5698e7..340b3a3b 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -246,7 +246,7 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state) {
void CairoOutputDev::endPage() {
if (text) {
text->endPage();
- text->coalesce(gTrue, gFalse);
+ text->coalesce(gTrue, 0, gFalse);
}
}
diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index 4fa94313..46e9c4a6 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -2866,7 +2866,7 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
GfxColor colorA, colorB;
double xa, ya, xb, yb, ra, rb;
double ta, tb, sa, sb;
- double sz, sMin, sMax, h;
+ double sMin, sMax, h;
double sLeft, sRight, sTop, sBottom, sZero, sDiag;
GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero;
GBool haveSMin, haveSMax;
@@ -2888,18 +2888,14 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
if (h == 0) {
enclosed = gTrue;
theta = 0; // make gcc happy
- sz = 0; // make gcc happy
} else if (r1 - r0 == 0) {
enclosed = gFalse;
theta = 0;
- sz = 0; // make gcc happy
} else if (fabs(r1 - r0) >= h) {
enclosed = gTrue;
theta = 0; // make gcc happy
- sz = 0; // make gcc happy
} else {
enclosed = gFalse;
- sz = -r0 / (r1 - r0);
theta = asin((r1 - r0) / h);
}
diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc
index 7dbac8a8..d91cae9a 100644
--- a/poppler/PSOutputDev.cc
+++ b/poppler/PSOutputDev.cc
@@ -4178,7 +4178,7 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading,
double xMin, yMin, xMax, yMax;
double x0, y0, r0, x1, y1, r1, t0, t1;
double xa, ya, ra;
- double sz, sMin, sMax, h, ta;
+ double sMin, sMax, h, ta;
double sLeft, sRight, sTop, sBottom, sZero, sDiag;
GBool haveSLeft, haveSRight, haveSTop, haveSBottom, haveSZero;
GBool haveSMin, haveSMax;
@@ -4206,18 +4206,14 @@ GBool PSOutputDev::radialShadedFill(GfxState *state, GfxRadialShading *shading,
if (h == 0) {
enclosed = gTrue;
theta = 0; // make gcc happy
- sz = 0; // make gcc happy
} else if (r1 - r0 == 0) {
enclosed = gFalse;
theta = 0;
- sz = 0; // make gcc happy
} else if (fabs(r1 - r0) >= h) {
enclosed = gTrue;
theta = 0; // make gcc happy
- sz = 0; // make gcc happy
} else {
enclosed = gFalse;
- sz = -r0 / (r1 - r0);
theta = asin((r1 - r0) / h);
}
if (enclosed) {
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 7a0b8d99..531617df 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -663,7 +663,7 @@ void TextPool::addWord(TextWord *word) {
// insert the new word
if (cursor && wordBaseIdx == cursorBaseIdx &&
- word->primaryCmp(cursor) > 0) {
+ word->primaryCmp(cursor) >= 0) {
w0 = cursor;
w1 = cursor->next;
} else {
@@ -1053,7 +1053,7 @@ void TextLineFrag::computeCoords(GBool oneRot) {
xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
- base = blk->yMin + base * (blk->yMax - blk->yMin);
+ base = blk->yMin + d4 * (blk->yMax - blk->yMin);
break;
case 1:
xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
@@ -1277,15 +1277,15 @@ void TextBlock::addWord(TextWord *word) {
}
}
-void TextBlock::coalesce(UnicodeMap *uMap) {
+void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) {
TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
TextLine *line, *line0, *line1;
int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
int baseIdx, bestWordBaseIdx, idx0, idx1;
double minBase, maxBase;
- double fontSize, delta, priDelta, secDelta;
+ double fontSize, wordSpacing, delta, priDelta, secDelta;
TextLine **lineArray;
- GBool found;
+ GBool found, overlap;
int col1, col2;
int i, j, k;
@@ -1295,11 +1295,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
while (word0) {
priDelta = dupMaxPriDelta * word0->fontSize;
secDelta = dupMaxSecDelta * word0->fontSize;
- if (rot == 0 || rot == 3) {
- maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
- } else {
- maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
- }
+ maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
found = gFalse;
word1 = word2 = NULL; // make gcc happy
for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
@@ -1396,6 +1392,7 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
maxBase = word0->base + maxIntraLineDelta * fontSize;
minBaseIdx = pool->getBaseIdx(minBase);
maxBaseIdx = pool->getBaseIdx(maxBase);
+ wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
// find the rest of the words in this line
while (1) {
@@ -1404,25 +1401,32 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
// this line
bestWordBaseIdx = 0;
bestWord0 = bestWord1 = NULL;
- for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ overlap = gFalse;
+ for (baseIdx = minBaseIdx;
+ !overlap && baseIdx <= maxBaseIdx;
+ ++baseIdx) {
for (word0 = NULL, word1 = pool->getPool(baseIdx);
word1;
word0 = word1, word1 = word1->next) {
if (word1->base >= minBase &&
- word1->base <= maxBase &&
- (delta = lastWord->primaryDelta(word1)) >=
- minCharSpacing * fontSize) {
- if (delta < maxWordSpacing * fontSize &&
- (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
- bestWordBaseIdx = baseIdx;
- bestWord0 = word0;
- bestWord1 = word1;
+ word1->base <= maxBase) {
+ delta = lastWord->primaryDelta(word1);
+ if (delta < minCharSpacing * fontSize) {
+ overlap = gTrue;
+ break;
+ } else {
+ if (delta < wordSpacing &&
+ (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
+ bestWordBaseIdx = baseIdx;
+ bestWord0 = word0;
+ bestWord1 = word1;
+ }
+ break;
}
- break;
}
}
}
- if (!bestWord1) {
+ if (overlap || !bestWord1) {
break;
}
@@ -1469,52 +1473,79 @@ void TextBlock::coalesce(UnicodeMap *uMap) {
// column assignment
nColumns = 0;
- for (i = 0; i < nLines; ++i) {
- line0 = lineArray[i];
- col1 = 0;
- for (j = 0; j < i; ++j) {
- line1 = lineArray[j];
- if (line1->primaryDelta(line0) >= 0) {
- col2 = line1->col[line1->len] + 1;
- } else {
- k = 0; // make gcc happy
- switch (rot) {
- case 0:
- for (k = 0;
- k < line1->len &&
- line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
- ++k) ;
- break;
- case 1:
- for (k = 0;
- k < line1->len &&
- line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
- ++k) ;
- break;
- case 2:
- for (k = 0;
- k < line1->len &&
- line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
- ++k) ;
- break;
- case 3:
- for (k = 0;
- k < line1->len &&
- line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
- ++k) ;
- break;
- }
- col2 = line1->col[k];
+ if (fixedPitch) {
+ for (i = 0; i < nLines; ++i) {
+ line0 = lineArray[i];
+ col1 = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
+ break;
+ case 1:
+ col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
+ break;
+ case 2:
+ col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
+ break;
+ case 3:
+ col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
+ break;
}
- if (col2 > col1) {
- col1 = col2;
+ for (k = 0; k <= line0->len; ++k) {
+ line0->col[k] += col1;
+ }
+ if (line0->col[line0->len] > nColumns) {
+ nColumns = line0->col[line0->len];
}
}
- for (k = 0; k <= line0->len; ++k) {
- line0->col[k] += col1;
- }
- if (line0->col[line0->len] > nColumns) {
- nColumns = line0->col[line0->len];
+ } else {
+ for (i = 0; i < nLines; ++i) {
+ line0 = lineArray[i];
+ col1 = 0;
+ for (j = 0; j < i; ++j) {
+ line1 = lineArray[j];
+ if (line1->primaryDelta(line0) >= 0) {
+ col2 = line1->col[line1->len] + 1;
+ } else {
+ k = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ for (k = 0;
+ k < line1->len &&
+ line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 1:
+ for (k = 0;
+ k < line1->len &&
+ line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 2:
+ for (k = 0;
+ k < line1->len &&
+ line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 3:
+ for (k = 0;
+ k < line1->len &&
+ line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ }
+ col2 = line1->col[k];
+ }
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ for (k = 0; k <= line0->len; ++k) {
+ line0->col[k] += col1;
+ }
+ if (line0->col[line0->len] > nColumns) {
+ nColumns = line0->col[line0->len];
+ }
}
}
gfree(lineArray);
@@ -2111,6 +2142,8 @@ void TextPage::clear() {
gfree(blocks);
}
deleteGooList(fonts, TextFontInfo);
+ deleteGooList(underlines, TextUnderline);
+ deleteGooList(links, TextLink);
curWord = NULL;
charPos = 0;
@@ -2128,6 +2161,8 @@ void TextPage::clear() {
rawWords = NULL;
rawLastWord = NULL;
fonts = new GooList();
+ underlines = new GooList();
+ links = new GooList();
}
void TextPage::updateFont(GfxState *state) {
@@ -2426,7 +2461,7 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
links->append(new TextLink(xMin, yMin, xMax, yMax, link));
}
-void TextPage::coalesce(GBool physLayout, GBool doHTML) {
+void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) {
UnicodeMap *uMap;
TextPool *pool;
TextWord *word0, *word1, *word2;
@@ -2454,7 +2489,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
blkList = NULL;
lastBlk = NULL;
nBlocks = 0;
- primaryRot = -1;
+ primaryRot = 0;
#if 0 // for debugging
printf("*** initial words ***\n");
@@ -2918,7 +2953,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
//~ addition to primary rotation
// coalesce the block, and add it to the list
- blk->coalesce(uMap);
+ blk->coalesce(uMap, fixedPitch);
if (lastBlk) {
lastBlk->next = blk;
} else {
@@ -2926,11 +2961,12 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
}
lastBlk = blk;
count[rot] += blk->charCount;
- if (primaryRot < 0 || count[rot] > count[primaryRot]) {
- primaryRot = rot;
- }
++nBlocks;
}
+
+ if (count[rot] > count[primaryRot]) {
+ primaryRot = rot;
+ }
}
#if 0 // for debugging
@@ -2992,75 +3028,108 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
// sort blocks into xy order for column assignment
if (blocks)
gfree (blocks);
- blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
- for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
- blocks[i] = blk;
- }
- qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+ if (physLayout && fixedPitch) {
- // column assignment
- for (i = 0; i < nBlocks; ++i) {
- blk0 = blocks[i];
- col1 = 0;
- for (j = 0; j < i; ++j) {
- blk1 = blocks[j];
- col2 = 0; // make gcc happy
+ blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
+ for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+ blocks[i] = blk;
+ col1 = 0; // make gcc happy
switch (primaryRot) {
case 0:
- if (blk0->xMin > blk1->xMax) {
- col2 = blk1->col + blk1->nColumns + 3;
- } else if (blk1->xMax == blk1->xMin) {
- col2 = blk1->col;
- } else {
- col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
- (blk1->xMax - blk1->xMin)) *
- blk1->nColumns);
- }
+ col1 = (int)(blk->xMin / fixedPitch + 0.5);
break;
case 1:
- if (blk0->yMin > blk1->yMax) {
- col2 = blk1->col + blk1->nColumns + 3;
- } else if (blk1->yMax == blk1->yMin) {
- col2 = blk1->col;
- } else {
- col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
- (blk1->yMax - blk1->yMin)) *
- blk1->nColumns);
- }
+ col1 = (int)(blk->yMin / fixedPitch + 0.5);
break;
case 2:
- if (blk0->xMax < blk1->xMin) {
- col2 = blk1->col + blk1->nColumns + 3;
- } else if (blk1->xMin == blk1->xMax) {
- col2 = blk1->col;
- } else {
- col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
- (blk1->xMin - blk1->xMax)) *
- blk1->nColumns);
- }
+ col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
break;
case 3:
- if (blk0->yMax < blk1->yMin) {
- col2 = blk1->col + blk1->nColumns + 3;
- } else if (blk1->yMin == blk1->yMax) {
- col2 = blk1->col;
- } else {
- col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
- (blk1->yMin - blk1->yMax)) *
- blk1->nColumns);
- }
+ col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
break;
}
- if (col2 > col1) {
- col1 = col2;
+ blk->col = col1;
+ for (line = blk->lines; line; line = line->next) {
+ for (j = 0; j <= line->len; ++j) {
+ line->col[j] += col1;
+ }
}
}
- blk0->col = col1;
- for (line = blk0->lines; line; line = line->next) {
- for (j = 0; j <= line->len; ++j) {
- line->col[j] += col1;
+
+ } else {
+
+ // sort blocks into xy order for column assignment
+ blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
+ for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+ blocks[i] = blk;
+ }
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+
+ // column assignment
+ for (i = 0; i < nBlocks; ++i) {
+ blk0 = blocks[i];
+ col1 = 0;
+ for (j = 0; j < i; ++j) {
+ blk1 = blocks[j];
+ col2 = 0; // make gcc happy
+ switch (primaryRot) {
+ case 0:
+ if (blk0->xMin > blk1->xMax) {
+ col2 = blk1->col + blk1->nColumns + 3;
+ } else if (blk1->xMax == blk1->xMin) {
+ col2 = blk1->col;
+ } else {
+ col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
+ (blk1->xMax - blk1->xMin)) *
+ blk1->nColumns);
+ }
+ break;
+ case 1:
+ if (blk0->yMin > blk1->yMax) {
+ col2 = blk1->col + blk1->nColumns + 3;
+ } else if (blk1->yMax == blk1->yMin) {
+ col2 = blk1->col;
+ } else {
+ col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
+ (blk1->yMax - blk1->yMin)) *
+ blk1->nColumns);
+ }
+ break;
+ case 2:
+ if (blk0->xMax < blk1->xMin) {
+ col2 = blk1->col + blk1->nColumns + 3;
+ } else if (blk1->xMin == blk1->xMax) {
+ col2 = blk1->col;
+ } else {
+ col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
+ (blk1->xMin - blk1->xMax)) *
+ blk1->nColumns);
+ }
+ break;
+ case 3:
+ if (blk0->yMax < blk1->yMin) {
+ col2 = blk1->col + blk1->nColumns + 3;
+ } else if (blk1->yMin == blk1->yMax) {
+ col2 = blk1->col;
+ } else {
+ col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
+ (blk1->yMin - blk1->yMax)) *
+ blk1->nColumns);
+ }
+ break;
+ }
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ blk0->col = col1;
+ for (line = blk0->lines; line; line = line->next) {
+ for (j = 0; j <= line->len; ++j) {
+ line->col[j] += col1;
+ }
}
}
+
}
#if 0 // for debugging
@@ -3070,7 +3139,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
blk->nColumns);
for (line = blk->lines; line; line = line->next) {
- printf(" line:\n");
+ printf(" line: col[0]=%d\n", line->col[0]);
for (word0 = line->words; word0; word0 = word0->next) {
printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
word0->xMin, word0->xMax, word0->yMin, word0->yMax,
@@ -3470,6 +3539,7 @@ GBool TextPage::findText(Unicode *s, int len,
GBool startAtTop, GBool stopAtBottom,
GBool startAtLast, GBool stopAtLast,
GBool caseSensitive, GBool backward,
+ GBool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax) {
TextBlock *blk;
@@ -3527,25 +3597,35 @@ GBool TextPage::findText(Unicode *s, int len,
blk = blocks[i];
// check: is the block above the top limit?
- if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
+ // (this only works if the page's primary rotation is zero --
+ // otherwise the blocks won't be sorted in the useful order)
+ if (!startAtTop && primaryRot == 0 &&
+ (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
continue;
}
// check: is the block below the bottom limit?
- if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
+ // (this only works if the page's primary rotation is zero --
+ // otherwise the blocks won't be sorted in the useful order)
+ if (!stopAtBottom && primaryRot == 0 &&
+ (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
break;
}
for (line = blk->lines; line; line = line->next) {
// check: is the line above the top limit?
- if (!startAtTop &&
+ // (this only works if the page's primary rotation is zero --
+ // otherwise the lines won't be sorted in the useful order)
+ if (!startAtTop && primaryRot == 0 &&
(backward ? line->yMin > yStart : line->yMin < yStart)) {
continue;
}
// check: is the line below the bottom limit?
- if (!stopAtBottom &&
+ // (this only works if the page's primary rotation is zero --
+ // otherwise the lines won't be sorted in the useful order)
+ if (!stopAtBottom && primaryRot == 0 &&
(backward ? line->yMin < yStop : line->yMin > yStop)) {
continue;
}
@@ -3564,9 +3644,9 @@ GBool TextPage::findText(Unicode *s, int len,
for (k = 0; k < m; ++k) {
txt[k] = unicodeToUpper(line->normalized[k]);
}
- } else {
+ } else {
txt = line->normalized;
- }
+ }
// search each position in this line
j = backward ? m - len : 0;
@@ -5211,9 +5291,11 @@ static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
}
TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
- GBool rawOrderA, GBool append) {
+ double fixedPitchA, GBool rawOrderA,
+ GBool append) {
text = NULL;
physLayout = physLayoutA;
+ fixedPitch = physLayout ? fixedPitchA : 0;
rawOrder = rawOrderA;
doHTML = gFalse;
ok = gTrue;
@@ -5246,11 +5328,13 @@ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
}
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
- GBool physLayoutA, GBool rawOrderA) {
+ GBool physLayoutA, double fixedPitchA,
+ GBool rawOrderA) {
outputFunc = func;
outputStream = stream;
needClose = gFalse;
physLayout = physLayoutA;
+ fixedPitch = physLayout ? fixedPitchA : 0;
rawOrder = rawOrderA;
doHTML = gFalse;
text = new TextPage(rawOrderA);
@@ -5277,12 +5361,16 @@ void TextOutputDev::startPage(int pageNum, GfxState *state) {
void TextOutputDev::endPage() {
text->endPage();
- text->coalesce(physLayout, doHTML);
+ text->coalesce(physLayout, fixedPitch, doHTML);
if (outputStream) {
text->dump(outputStream, outputFunc, physLayout);
}
}
+void TextOutputDev::restoreState(GfxState *state) {
+ text->updateFont(state);
+}
+
void TextOutputDev::updateFont(GfxState *state) {
text->updateFont(state);
}
@@ -5465,10 +5553,12 @@ GBool TextOutputDev::findText(Unicode *s, int len,
GBool startAtTop, GBool stopAtBottom,
GBool startAtLast, GBool stopAtLast,
GBool caseSensitive, GBool backward,
+ GBool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax) {
return text->findText(s, len, startAtTop, stopAtBottom,
- startAtLast, stopAtLast, caseSensitive, backward,
+ startAtLast, stopAtLast,
+ caseSensitive, backward, wholeWord,
xMin, yMin, xMax, yMax);
}
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index fd34c8bb..e31876b9 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -333,7 +333,7 @@ public:
void addWord(TextWord *word);
- void coalesce(UnicodeMap *uMap);
+ void coalesce(UnicodeMap *uMap, double fixedPitch);
// Update this block's priMin and priMax values, looking at <blk>.
void updatePriMinMax(TextBlock *blk);
@@ -521,7 +521,7 @@ public:
void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);
// Coalesce strings that look like parts of the same line.
- void coalesce(GBool physLayout, GBool doHTML);
+ void coalesce(GBool physLayout, double fixedPitch, GBool doHTML);
// Find a string. If <startAtTop> is true, starts looking at the
// top of the page; else if <startAtLast> is true, starts looking
@@ -534,6 +534,7 @@ public:
GBool startAtTop, GBool stopAtBottom,
GBool startAtLast, GBool stopAtLast,
GBool caseSensitive, GBool backward,
+ GBool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax);
@@ -676,14 +677,16 @@ public:
// is maintained. If <rawOrder> is true, the text is kept in
// content stream order.
TextOutputDev(char *fileName, GBool physLayoutA,
- GBool rawOrderA, GBool append);
+ double fixedPitchA, GBool rawOrderA,
+ GBool append);
// Create a TextOutputDev which will write to a generic stream. If
// <physLayoutA> is true, the original physical layout of the text
// is maintained. If <rawOrder> is true, the text is kept in
// content stream order.
TextOutputDev(TextOutputFunc func, void *stream,
- GBool physLayoutA, GBool rawOrderA);
+ GBool physLayoutA, double fixedPitchA,
+ GBool rawOrderA);
// Destructor.
virtual ~TextOutputDev();
@@ -719,6 +722,9 @@ public:
// End a page.
virtual void endPage();
+ //----- save/restore graphics state
+ virtual void restoreState(GfxState *state);
+
//----- update text state
virtual void updateFont(GfxState *state);
@@ -754,6 +760,7 @@ public:
GBool startAtTop, GBool stopAtBottom,
GBool startAtLast, GBool stopAtLast,
GBool caseSensitive, GBool backward,
+ GBool wholeWord,
double *xMin, double *yMin,
double *xMax, double *yMax);
@@ -804,6 +811,9 @@ private:
TextPage *text; // text for the current page
GBool physLayout; // maintain original physical layout when
// dumping text
+ double fixedPitch; // if physLayout is true and this is non-zero,
+ // assume fixed-pitch characters with this
+ // width
GBool rawOrder; // keep text in content stream order
GBool doHTML; // extra processing for HTML conversion
GBool ok; // set up ok?
diff --git a/qt4/src/poppler-page.cc b/qt4/src/poppler-page.cc
index 9dc1d15b..398a69b8 100644
--- a/qt4/src/poppler-page.cc
+++ b/qt4/src/poppler-page.cc
@@ -330,7 +330,7 @@ QString Page::text(const QRectF &r, TextLayout textLayout) const
QString result;
const GBool rawOrder = textLayout == RawOrderLayout;
- output_dev = new TextOutputDev(0, gFalse, rawOrder, gFalse);
+ output_dev = new TextOutputDev(0, gFalse, 0, rawOrder, gFalse);
m_page->parentDoc->doc->displayPageSlice(output_dev, m_page->index + 1, 72, 72,
0, false, true, false, -1, -1, -1, -1);
if (r.isNull())
@@ -371,19 +371,19 @@ bool Page::search(const QString &text, double &sLeft, double &sTop, double &sRig
int rotation = (int)rotate * 90;
// fetch ourselves a textpage
- TextOutputDev td(NULL, gTrue, gFalse, gFalse);
+ TextOutputDev td(NULL, gTrue, 0, gFalse, gFalse);
m_page->parentDoc->doc->displayPage( &td, m_page->index + 1, 72, 72, rotation, false, true, false );
TextPage *textPage=td.takeText();
if (direction == FromTop)
found = textPage->findText( u.data(), len,
- gTrue, gTrue, gFalse, gFalse, sCase, gFalse, &sLeft, &sTop, &sRight, &sBottom );
+ gTrue, gTrue, gFalse, gFalse, sCase, gFalse, gFalse, &sLeft, &sTop, &sRight, &sBottom );
else if ( direction == NextResult )
found = textPage->findText( u.data(), len,
- gFalse, gTrue, gTrue, gFalse, sCase, gFalse, &sLeft, &sTop, &sRight, &sBottom );
+ gFalse, gTrue, gTrue, gFalse, sCase, gFalse, gFalse, &sLeft, &sTop, &sRight, &sBottom );
else if ( direction == PreviousResult )
found = textPage->findText( u.data(), len,
- gFalse, gTrue, gTrue, gFalse, sCase, gTrue, &sLeft, &sTop, &sRight, &sBottom );
+ gFalse, gTrue, gTrue, gFalse, sCase, gTrue, gFalse, &sLeft, &sTop, &sRight, &sBottom );
textPage->decRefCnt();
@@ -414,7 +414,7 @@ QList<TextBox*> Page::textList(Rotation rotate) const
QList<TextBox*> output_list;
- output_dev = new TextOutputDev(0, gFalse, gFalse, gFalse);
+ output_dev = new TextOutputDev(0, gFalse, 0, gFalse, gFalse);
int rotation = (int)rotate * 90;
diff --git a/test/perf-test.cc b/test/perf-test.cc
index b6fb8f89..6bdda97d 100644
--- a/test/perf-test.cc
+++ b/test/perf-test.cc
@@ -840,7 +840,7 @@ static void RenderPdfAsText(const char *fileName)
LogInfo("started: %s\n", fileName);
- TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, gFalse, gFalse);
+ TextOutputDev * textOut = new TextOutputDev(NULL, gTrue, 0, gFalse, gFalse);
if (!textOut->isOk()) {
delete textOut;
return;
diff --git a/utils/pdftotext.1 b/utils/pdftotext.1
index 587f1a9c..88fbf702 100644
--- a/utils/pdftotext.1
+++ b/utils/pdftotext.1
@@ -53,6 +53,10 @@ Maintain (as best as possible) the original physical layout of the
text. The default is to \'undo' physical layout (columns,
hyphenation, etc.) and output the text in reading order.
.TP
+.BI \-fixed " number"
+Assume fixed-pitch (or tabular) text, with the specified character
+width (in points). This forces physical layout mode.
+.TP
.B \-raw
Keep the text in content stream order. This is a hack which often
"undoes" column formatting, etc. Use of raw mode is no longer
diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc
index 2e7b32e5..a170f1b7 100644
--- a/utils/pdftotext.cc
+++ b/utils/pdftotext.cc
@@ -68,6 +68,7 @@ static int w = 0;
static int h = 0;
static GBool bbox = gFalse;
static GBool physLayout = gFalse;
+static double fixedPitch = 0;
static GBool rawOrder = gFalse;
static GBool htmlMeta = gFalse;
static char textEncName[128] = "";
@@ -97,6 +98,8 @@ static const ArgDesc argDesc[] = {
"height of crop area in pixels (default is 0)"},
{"-layout", argFlag, &physLayout, 0,
"maintain original physical layout"},
+ {"-fixed", argFP, &fixedPitch, 0,
+ "assume fixed-pitch (or tabular) text"},
{"-raw", argFlag, &rawOrder, 0,
"keep strings in content stream order"},
{"-htmlmeta", argFlag, &htmlMeta, 0,
@@ -197,6 +200,9 @@ int main(int argc, char *argv[]) {
}
fileName = new GooString(argv[1]);
+ if (fixedPitch) {
+ physLayout = gTrue;
+ }
if (textEncName[0]) {
globalParams->setTextEncoding(textEncName);
@@ -333,7 +339,7 @@ int main(int argc, char *argv[]) {
// write text file
if (bbox) {
- textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta);
+ textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta);
if (!(f = fopen(textFileName->getCString(), "ab"))) {
error(errIO, -1, "Couldn't open text file '{0:t}' for append", textFileName);
exitCode = 2;
@@ -367,7 +373,7 @@ int main(int argc, char *argv[]) {
fclose(f);
} else {
textOut = new TextOutputDev(textFileName->getCString(),
- physLayout, rawOrder, htmlMeta);
+ physLayout, fixedPitch, rawOrder, htmlMeta);
if (textOut->isOk()) {
if ((w==0) && (h==0) && (x==0) && (y==0)) {
doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,