summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdrian Perez de Castro <aperez@igalia.com>2013-06-18 00:35:51 +0300
committerCarlos Garcia Campos <carlosgc@gnome.org>2013-09-30 10:12:34 +0200
commit37e73b9ada1fcbc25094bca7bbf2ad0647bb35f6 (patch)
tree33d41b1383b3b3b2aaa5931f3cbfc5f00efd8c0b
parent7cabf51ad9a545414b3e78bb194eeafa50dfaea4 (diff)
Tagged-PDF: Text content extraction from structure elements
Implement StructElement::getText() by using MCOutputDev. This output device captures a sequence of MCOp structures representing the text drawing operations for a particular marked content text object from the page stream. Those are then used to convert the individual Unicode characters to the returned string.
-rw-r--r--poppler/MCOutputDev.cc160
-rw-r--r--poppler/MCOutputDev.h113
-rw-r--r--poppler/Makefile.am2
-rw-r--r--poppler/StructElement.cc67
-rw-r--r--poppler/StructElement.h19
5 files changed, 361 insertions, 0 deletions
diff --git a/poppler/MCOutputDev.cc b/poppler/MCOutputDev.cc
new file mode 100644
index 00000000..4eb080dc
--- /dev/null
+++ b/poppler/MCOutputDev.cc
@@ -0,0 +1,160 @@
+//========================================================================
+//
+// MCOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MCOutputDev.h"
+#include "GfxFont.h"
+#include "GfxState.h"
+#include "Annot.h"
+#include "Link.h"
+#include <vector>
+
+// Private state of MCOutputDev, kept behind a pointer so that the public
+// header does not expose it.
+struct MCOutputDev::Priv
+{
+  MCOpArray mcOps;      // Operations captured so far
+  bool      capturing;  // Set while inside the marked content being captured
+  int       mcid;       // MCID of the marked content to capture
+  GfxFont  *lastFont;   // Last font recorded by updateFont(), or NULL
+  Guint     lastFlags;  // Last MCOp::Flags* combination recorded
+  double    pageWidth;  // Page size as reported to startPage()
+  double    pageHeight;
+
+  // Starts idle: nothing captured, no font seen, zero-sized page.
+  Priv(int mcidA)
+    : capturing(false)
+    , mcid(mcidA)
+    , lastFont(NULL)
+    , lastFlags(0)
+    , pageWidth(0.0)
+    , pageHeight(0.0)
+  {
+  }
+};
+
+
+// Creates an output device that captures the marked content whose MCID
+// equals "mcid".
+MCOutputDev::MCOutputDev(int mcid)
+  : p(new Priv(mcid))
+{}
+
+
+// Drops the reference that updateFont() took on the last recorded font
+// (if any), then frees the private state. Without the decRefCnt() call the
+// font referenced by lastFont would never be released.
+MCOutputDev::~MCOutputDev()
+{
+  if (p->lastFont)
+    p->lastFont->decRefCnt();
+  delete p;
+}
+
+
+// Records the page size taken from "state"; when no state is supplied the
+// page is treated as zero-sized. "pageNum" and "xref" are not used.
+void MCOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  if (!state) {
+    p->pageWidth = 0.0;
+    p->pageHeight = 0.0;
+    return;
+  }
+
+  p->pageWidth = state->getPageWidth();
+  p->pageHeight = state->getPageHeight();
+}
+
+
+// Forgets the page size recorded by startPage().
+void MCOutputDev::endPage()
+{
+  p->pageHeight = 0.0;
+  p->pageWidth = 0.0;
+}
+
+
+// Starts capturing when the marked content sequence being opened carries
+// an integer "MCID" property equal to the MCID this device was created
+// for. "name" is not used.
+void MCOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  if (!properties)
+    return;
+
+  int foundId = -1;
+  if (!properties->lookupInt("MCID", NULL, &foundId))
+    return;
+
+  if (foundId == p->mcid)
+    p->capturing = true;
+}
+
+
+// Stops capturing. "state" is not used.
+// NOTE(review): capturing is switched off unconditionally, so the end of
+// *any* marked content sequence -- including one nested inside the
+// sequence being captured -- ends the capture. Confirm whether nested
+// sequences need a depth counter.
+void MCOutputDev::endMarkedContent(GfxState *state)
+{
+ p->capturing = false;
+}
+
+
+// Records the Unicode values of one drawn character while capturing.
+//
+// Characters that fall completely outside the page rectangle recorded by
+// startPage(), or whose transformed geometry is NaN, are dropped, and so
+// are soft hyphens (U+00AD). "ox", "oy" and "nBytes" are not used.
+void MCOutputDev::drawChar(GfxState *state,
+                           double xx, double yy,
+                           double dx, double dy,
+                           double ox, double oy,
+                           CharCode c, int nBytes,
+                           Unicode *u, int uLen)
+{
+  if (!p->capturing || uLen == 0)
+    return;
+
+  // Character spacing (plus word spacing for the space character) is
+  // included in (dx,dy); compute it so that it can be taken out again.
+  double spacing = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    spacing += state->getWordSpace();
+
+  double spacingDx, spacingDy;
+  state->textTransformDelta(spacing * state->getHorizScaling(), 0, &spacingDx, &spacingDy);
+  dx -= spacingDx;
+  dy -= spacingDy;
+
+  double devW, devH, devX, devY;
+  state->transformDelta(dx, dy, &devW, &devH);
+  state->transform(xx, yy, &devX, &devY);
+
+  // Discard characters lying entirely outside the page boundaries.
+  const bool outsidePage = devX + devW < 0 || devX > p->pageWidth ||
+                           devY + devH < 0 || devY > p->pageHeight;
+  if (outsidePage)
+    return;
+
+  // Sanity check on the geometry: a value differs from itself only when
+  // it is NaN.
+  const bool hasNaN = devX != devX || devY != devY ||
+                      devW != devW || devH != devH;
+  if (hasNaN)
+    return;
+
+  // Soft hyphens are only hyphenation hints and are not part of the
+  // *visible* text, so they are left out.
+  for (int k = 0; k < uLen; ++k) {
+    if (u[k] == 0x00AD)
+      continue;
+    p->mcOps.push_back(MCOp(u[k]));
+  }
+}
+
+
+// Tracks font changes and, while capturing, records them as operations:
+//
+//  - a FontName operation when the font family differs from that of the
+//    last recorded font (and the new font has a family name);
+//  - a Flags operation when the bold/italic/fixed-width combination
+//    differs from the last recorded one.
+//
+// Nothing happens when the state has no font or the font is the very same
+// object that was recorded last.
+void MCOutputDev::updateFont(GfxState *state)
+{
+  GfxFont *font = state->getFont();
+  if (!font || font == p->lastFont)
+    return;
+
+  // Decide whether the family changed. Either family name may be NULL,
+  // so cmp() is used only when both are available; otherwise the family
+  // counts as changed when exactly one of them is missing.
+  bool familyChanged = true;
+  if (p->lastFont) {
+    if (p->lastFont->getFamily() && font->getFamily())
+      familyChanged = p->lastFont->getFamily()->cmp(font->getFamily()) != 0;
+    else
+      familyChanged = p->lastFont->getFamily() != font->getFamily();
+  }
+
+  if (familyChanged) {
+    if (p->capturing && font->getFamily())
+      p->mcOps.push_back(MCOp(font->getFamily()->getCString()));
+
+    // Swap the font reference that is being held on to.
+    if (p->lastFont)
+      p->lastFont->decRefCnt();
+    p->lastFont = font;
+    font->incRefCnt();
+  }
+
+  Guint flags = 0;
+
+  if (font->isFixedWidth())
+    flags |= MCOp::FlagFontFixed;
+  if (font->isItalic())
+    flags |= MCOp::FlagFontItalic;
+  if (font->isBold()) {
+    flags |= MCOp::FlagFontBold;
+  } else {
+    switch (font->getWeight()) {
+      case GfxFont::W700: // Font weights over 600 are bold
+      case GfxFont::W800:
+      case GfxFont::W900:
+        flags |= MCOp::FlagFontBold;
+        break;
+      default:
+        break;
+    }
+  }
+
+  if (p->lastFlags != flags) {
+    if (p->capturing)
+      p->mcOps.push_back(MCOp(MCOp::Flags, flags));
+    p->lastFlags = flags;
+  }
+}
+
+
+// Returns a read-only reference to the operations captured so far. The
+// array is stored in the private state of this device, which is deleted
+// by the destructor.
+const MCOpArray& MCOutputDev::getMCOps() const
+{
+ return p->mcOps;
+}
diff --git a/poppler/MCOutputDev.h b/poppler/MCOutputDev.h
new file mode 100644
index 00000000..d2efc8e8
--- /dev/null
+++ b/poppler/MCOutputDev.h
@@ -0,0 +1,113 @@
+//========================================================================
+//
+// MCOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MCDOUTPUTDEV_H
+#define MCDOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include <vector>
+
+class GfxState;
+class GooString;
+class Dict;
+
+// One operation captured by MCOutputDev. The "type" field selects which
+// member of the anonymous union is meaningful:
+//
+//   Unichar  -> unichar  (a single Unicode value)
+//   FontName -> value    (heap-allocated C string owned by the MCOp,
+//                         may be NULL)
+//   Flags    -> flags    (combination of the Flag* values below)
+//   Color    -> color    (RGB triple)
+//
+// FontName operations own their string, therefore copy construction,
+// copy assignment and destruction are all defined explicitly.
+struct MCOp {
+  enum Type {
+    Unichar,
+    FontName,
+    Flags,
+    Color,
+  };
+
+  struct OpColor {
+    double r, g, b;
+
+    // Packs the three components, scaled by 255 and truncated to their
+    // low 8 bits, as 0xRRGGBB.
+    Guint rgbPixel() const {
+      return ((Guint) (r * 255) & 0xFF) << 16
+           | ((Guint) (g * 255) & 0xFF) << 8
+           | ((Guint) (b * 255) & 0xFF);
+    }
+  };
+
+  enum Flags {
+    FlagFontBold   = (1 << 0),
+    FlagFontItalic = (1 << 1),
+    FlagFontFixed  = (1 << 2),
+  };
+
+  Type type;
+
+  union {
+    Unicode unichar;
+    char *value;
+    Guint flags;
+    OpColor color;
+  };
+
+  // Deep copy: a FontName string is duplicated, never shared. A NULL
+  // string (as produced by the default constructor) stays NULL.
+  MCOp(const MCOp &op): type(op.type) {
+    switch (type) {
+      case Flags: flags = op.flags; break;
+      case Unichar: unichar = op.unichar; break;
+      case Color: memcpy(&color, &op.color, sizeof(OpColor)); break;
+      case FontName: value = op.value ? strdup(op.value) : NULL; break;
+    }
+  }
+
+  // Copy assignment with the same deep-copy semantics as the copy
+  // constructor. Without it the implicitly generated operator would copy
+  // the raw pointer and both objects would later free the same string.
+  MCOp &operator=(const MCOp &op) {
+    if (this != &op) {
+      // Duplicate first, so that the old string is released only once
+      // the new payload is ready.
+      char *copy = (op.type == FontName && op.value) ? strdup(op.value) : NULL;
+      if (type == FontName)
+        gfree(value);
+      type = op.type;
+      switch (type) {
+        case Flags: flags = op.flags; break;
+        case Unichar: unichar = op.unichar; break;
+        case Color: memcpy(&color, &op.color, sizeof(OpColor)); break;
+        case FontName: value = copy; break;
+      }
+    }
+    return *this;
+  }
+
+  // Only FontName operations own memory.
+  ~MCOp() {
+    switch (type) {
+      case FontName: gfree(value); break;
+      default: /* nothing */ break;
+    }
+  }
+
+  // Default: a FontName operation without a name.
+  MCOp(): type(FontName), value(NULL) {}
+
+  // Unichar operation.
+  MCOp(Unicode u): type(Unichar), unichar(u) {}
+
+  // FontName operation holding a private copy of "s" (NULL is accepted
+  // and stored as NULL).
+  MCOp(const char *s): type(FontName), value(s ? strdup(s) : NULL) {}
+
+  // Operation of an explicit type; "f" is stored as the flags value.
+  // For FontName the string is set to NULL and for Color the components
+  // are zeroed, so that the destructor and readers never see an
+  // uninitialized payload.
+  MCOp(Type t, Guint f = 0): type(t) {
+    switch (type) {
+      case FontName: value = NULL; break;
+      case Color: color.r = color.g = color.b = 0.0; break;
+      default: flags = f; break;
+    }
+  }
+};
+
+
+typedef std::vector<MCOp> MCOpArray;
+
+
+// Output device that records, as a sequence of MCOp operations, what is
+// drawn inside the marked content sequence whose MCID matches the one
+// passed to the constructor.
+class MCOutputDev: public OutputDev {
+public:
+ // "mcid" is the marked content identifier to capture.
+ MCOutputDev(int mcid);
+ virtual ~MCOutputDev();
+
+ // Capability queries answered with fixed values.
+ virtual GBool isOk() { return gTrue; }
+ virtual GBool upsideDown() { return gTrue; }
+ virtual GBool useDrawChar() { return gTrue; }
+ virtual GBool interpretType3Chars() { return gFalse; }
+ virtual GBool needNonText() { return gFalse; }
+ virtual GBool needCharCount() { return gFalse; }
+
+ // startPage() records the page size, endPage() resets it.
+ virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+ virtual void endPage();
+
+ // Restoring the graphics state re-evaluates the current font.
+ virtual void restoreState(GfxState *state) { updateFont(state); }
+ virtual void updateFont(GfxState *state);
+
+ // Records the Unicode values of a drawn character while capturing.
+ virtual void drawChar(GfxState *state,
+ double xx, double yy,
+ double dx, double dy,
+ double ox, double oy,
+ CharCode c, int nBytes,
+ Unicode *u, int uLen);
+
+ // Switch capturing on (when the MCID matches) and off.
+ virtual void beginMarkedContent(char *name, Dict *properties);
+ virtual void endMarkedContent(GfxState *state);
+
+ // Operations captured so far.
+ const MCOpArray& getMCOps() const;
+
+private:
+ // Implementation details live in MCOutputDev.cc.
+ struct Priv;
+ Priv *p;
+};
+
+#endif /* !MCDOUTPUTDEV_H */
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 7e5f3c66..dfbcb8d6 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -232,6 +232,7 @@ poppler_include_HEADERS = \
NameToUnicodeTable.h \
PSOutputDev.h \
TextOutputDev.h \
+ MCOutputDev.h \
SecurityHandler.h \
UTF.h \
UTF8.h \
@@ -306,6 +307,7 @@ libpoppler_la_SOURCES = \
XRef.cc \
PSOutputDev.cc \
TextOutputDev.cc \
+ MCOutputDev.cc \
PageLabelInfo.h \
PageLabelInfo.cc \
SecurityHandler.cc \
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 498f0751..894114ef 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@
#include "StructElement.h"
#include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
#include "PDFDoc.h"
#include "Dict.h"
@@ -976,6 +978,71 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
return NULL;
}
+// Collects the text of this element.
+//
+// For a content element that is not an object reference, the Unicode
+// characters captured from the page are encoded with the global text
+// encoding (UTF-8 when none is configured) and appended to "string";
+// NULL is returned when nothing was captured. For any other element the
+// children are visited in order when "recursive" is set, otherwise NULL
+// is returned.
+//
+// When "string" is NULL a new GooString is allocated and returned.
+GooString *StructElement::getText(GooString *string, GBool recursive) const
+{
+  const bool isLeaf = isContent() && !isObjectRef();
+
+  if (!isLeaf) {
+    if (!recursive)
+      return NULL;
+
+    // Depth-first traversal, so that the text comes out in logical order.
+    GooString *result = string ? string : new GooString();
+    for (unsigned idx = 0; idx < getNumElements(); ++idx)
+      getElement(idx)->getText(result, recursive);
+    return result;
+  }
+
+  const MCOpArray& captured(getMCOps());
+  if (captured.empty())
+    return NULL;
+
+  UnicodeMap *encoding = globalParams->getTextEncoding();
+  if (!encoding) {
+    GooString fallbackName("UTF-8");
+    encoding = UnicodeMap::parse(&fallbackName);
+  }
+  assert(encoding);
+
+  GooString *result = string ? string : new GooString();
+  char encoded[9];
+
+  for (MCOpArray::const_iterator op = captured.begin(); op != captured.end(); ++op) {
+    // Only character operations contribute to the text.
+    if (op->type != MCOp::Unichar)
+      continue;
+    const int len = encoding->mapUnicode(op->unichar, encoded, sizeof(encoded));
+    result->append(encoded, len);
+  }
+
+  encoding->decRefCnt();
+  return result;
+}
+
+// Renders the page(s) through an MCOutputDev and returns the operations
+// captured for the MCID of this element. Elements that are not content
+// elements yield an empty array.
+//
+// When the page of the element can be determined only that page is
+// rendered; otherwise (no page reference, or findPage() yields 0) every
+// page of the document is rendered.
+const MCOpArray StructElement::getMCOps() const
+{
+  if (!isContent())
+    return MCOpArray();
+
+  MCOutputDev capture(getMCID());
+
+  int firstPage = 0;
+  Ref pageRef;
+  if (getPageRef(pageRef))
+    firstPage = treeRoot->getDoc()->findPage(pageRef.num, pageRef.gen);
+  int lastPage = firstPage;
+
+  if (firstPage == 0) {
+    // Page unknown: fall back to the whole document.
+    firstPage = 1;
+    lastPage = treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&capture, firstPage, lastPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return capture.getMCOps();
+}
+
static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
{
// Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 977e4451..2db11fdf 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@
#include "goo/gtypes.h"
#include "goo/GooString.h"
+#include "MCOutputDev.h"
#include "Object.h"
#include <vector>
#include <set>
@@ -219,6 +220,24 @@ public:
const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
GooString *getActualText() { return isContent() ? NULL : s->actualText; }
+ // Content text referenced by the element:
+ //
+ // - For MCID reference elements, this is just the text of the
+ // corresponding marked content object in the page stream, regardless
+ // of the setting of the "recursive" flag.
+ // - For other elements, if the "recursive" flag is set, the text
+ // enclosed by *all* the child MCID reference elements of the subtree
+ // is returned. The text is assembled by traversing the leaf MCID
+ // reference elements in logical order.
+ // - In any other case, the function returns NULL.
+ //
+ // The text will be appended to the passed GooString. If NULL is passed,
+ // a new string is returned, and the ownership passed to the caller.
+ //
+ GooString *getText(GooString *string = NULL, GBool recursive = gTrue) const;
+
+ const MCOpArray getMCOps() const;
+
~StructElement();
private: