From 37e73b9ada1fcbc25094bca7bbf2ad0647bb35f6 Mon Sep 17 00:00:00 2001
From: Adrian Perez de Castro
Date: Tue, 18 Jun 2013 00:35:51 +0300
Subject: Tagged-PDF: Text content extraction from structure elements

Implement StructElement::getText(), by using MCOutputDev. This output device
captures a sequence of MCOp structures representing the text drawing
operations for a particular marked content text object from the page stream.
Those are then used to convert the individual Unicode characters to the
returned string.
---
 poppler/MCOutputDev.cc   | 207 +++++++++++++++++++++++++++++++++++++++++++++++
 poppler/MCOutputDev.h    | 150 ++++++++++++++++++++++++++++++++++
 poppler/Makefile.am      |   2 +
 poppler/StructElement.cc |  74 +++++++++++++++++
 poppler/StructElement.h  |  22 +++++
 5 files changed, 455 insertions(+)
 create mode 100644 poppler/MCOutputDev.cc
 create mode 100644 poppler/MCOutputDev.h

diff --git a/poppler/MCOutputDev.cc b/poppler/MCOutputDev.cc
new file mode 100644
index 00000000..4eb080dc
--- /dev/null
+++ b/poppler/MCOutputDev.cc
@@ -0,0 +1,207 @@
+//========================================================================
+//
+// MCOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MCOutputDev.h"
+#include "GfxFont.h"
+#include "GfxState.h"
+#include "Annot.h"
+#include "Link.h"
+#include <vector>
+
+// Private state of MCOutputDev, kept out of the public header.
+struct MCOutputDev::Priv
+{
+  MCOpArray mcOps;      // Operations captured so far
+  bool      capturing;  // Inside the marked content sequence being captured
+  int       depth;      // Nesting level of marked content while capturing
+  int       mcid;       // MCID of the marked content sequence to capture
+  GfxFont  *lastFont;   // Last font recorded by updateFont(), referenced
+  Guint     lastFlags;  // Style flags computed by the last updateFont()
+  double    pageWidth;
+  double    pageHeight;
+
+  Priv(int mcidA):
+    capturing(false),
+    depth(0),
+    mcid(mcidA),
+    lastFont(0),
+    lastFlags(0),
+    pageWidth(0.0),
+    pageHeight(0.0)
+  {}
+
+  ~Priv()
+  {
+    // Drop the reference taken by updateFont().
+    if (lastFont)
+      lastFont->decRefCnt();
+  }
+};
+
+
+// Tells whether two fonts have the same family name. Fonts without a
+// family name only match other fonts without a family name.
+static bool sameFontFamily(GfxFont *a, GfxFont *b)
+{
+  GooString *familyA = a->getFamily();
+  GooString *familyB = b->getFamily();
+  if (!familyA || !familyB)
+    return familyA == familyB;
+  return familyA->cmp(familyB) == 0;
+}
+
+
+MCOutputDev::MCOutputDev(int mcid):
+  p(new Priv(mcid))
+{
+}
+
+
+MCOutputDev::~MCOutputDev()
+{
+  delete p;
+}
+
+
+// Stores the page size, which drawChar() uses to discard off-page text.
+void MCOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  // Marked content sequences do not span pages: forget any sequence left
+  // unclosed by the previous page.
+  p->capturing = false;
+  p->depth = 0;
+
+  if (state) {
+    p->pageWidth = state->getPageWidth();
+    p->pageHeight = state->getPageHeight();
+  } else {
+    p->pageWidth = p->pageHeight = 0.0;
+  }
+}
+
+
+void MCOutputDev::endPage()
+{
+  p->pageWidth = p->pageHeight = 0.0;
+}
+
+
+// Starts capturing when the marked content sequence with the wanted MCID
+// begins. Sequences nested inside it are counted, so capturing continues
+// until the sequence which started it is closed.
+void MCOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  if (p->capturing) {
+    p->depth++;
+    return;
+  }
+
+  int id = -1;
+  if (properties && properties->lookupInt("MCID", NULL, &id) && id == p->mcid) {
+    p->capturing = true;
+    p->depth = 1;
+  }
+}
+
+
+void MCOutputDev::endMarkedContent(GfxState *state)
+{
+  // An end operator closes only the innermost sequence, so stop capturing
+  // just when the sequence which started the capture is the one closed.
+  if (p->capturing && --p->depth == 0)
+    p->capturing = false;
+}
+
+
+// Records the Unicode values of a drawn character, provided that it is
+// part of the marked content being captured and it is inside the page.
+void MCOutputDev::drawChar(GfxState *state,
+                           double xx, double yy,
+                           double dx, double dy,
+                           double ox, double oy,
+                           CharCode c, int nBytes,
+                           Unicode *u, int uLen)
+{
+  if (!p->capturing || !uLen)
+    return;
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > p->pageWidth || y1 + h1 < 0 || y1 > p->pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  for (int i = 0; i < uLen; i++) {
+    // Soft hyphen markers are skipped, as they are invisible unless
+    // rendering is done to an actual device and the hyphenation hint
+    // used. MCOutputDev extracts the *visible* text content.
+    if (u[i] != 0x00AD)
+      p->mcOps.push_back(MCOp(u[i]));
+  }
+}
+
+
+// Tracks font changes. While capturing, a FontName operation is recorded
+// when the font family changes, and a Flags operation when the style of
+// the font (fixed width, italic, bold) changes.
+void MCOutputDev::updateFont(GfxState *state)
+{
+  GfxFont *font = state->getFont();
+  if (!font || font == p->lastFont) return;
+
+  if (!p->lastFont || !sameFontFamily(p->lastFont, font)) {
+    if (p->capturing && font->getFamily())
+      p->mcOps.push_back(MCOp(font->getFamily()->getCString()));
+    if (p->lastFont) p->lastFont->decRefCnt();
+    p->lastFont = font;
+    font->incRefCnt();
+  }
+
+  Guint flags = 0;
+
+  if (font->isFixedWidth()) flags |= MCOp::FlagFontFixed;
+  if (font->isItalic()) flags |= MCOp::FlagFontItalic;
+  if (font->isBold()) flags |= MCOp::FlagFontBold;
+  else {
+    switch (font->getWeight()) {
+      case GfxFont::W700: // Font weights over 600 are bold
+      case GfxFont::W800:
+      case GfxFont::W900:
+        flags |= MCOp::FlagFontBold;
+        break;
+      default:
+        break;
+    }
+  }
+
+  if (p->lastFlags != flags) {
+    if (p->capturing)
+      p->mcOps.push_back(MCOp(MCOp::Flags, flags));
+    p->lastFlags = flags;
+  }
+}
+
+
+const MCOpArray& MCOutputDev::getMCOps() const
+{
+  return p->mcOps;
+}
diff --git a/poppler/MCOutputDev.h b/poppler/MCOutputDev.h
new file mode 100644
index 00000000..d2efc8e8
--- /dev/null
+++ b/poppler/MCOutputDev.h
@@ -0,0 +1,150 @@
+//========================================================================
+//
+// MCOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MCOUTPUTDEV_H
+#define MCOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include <vector>
+
+class GfxState;
+class GooString;
+class Dict;
+
+// A single operation captured by MCOutputDev. The "type" member tells which
+// member of the union holds the data of the operation.
+struct MCOp {
+  enum Type {
+    Unichar,   // Unicode character, stored in "unichar"
+    FontName,  // Font family name, stored in "value" (owned by the MCOp)
+    Flags,     // Combination of font flags, stored in "flags"
+    Color,     // RGB color, stored in "color"
+  };
+
+  struct OpColor {
+    double r, g, b;
+
+    // Packs the color components into a single 0xRRGGBB value.
+    Guint rgbPixel() const {
+      return ((Guint) (r * 255) & 0xFF) << 16
+           | ((Guint) (g * 255) & 0xFF) << 8
+           | ((Guint) (b * 255) & 0xFF);
+    }
+  };
+
+  enum Flags {
+    FlagFontBold   = (1 << 0),
+    FlagFontItalic = (1 << 1),
+    FlagFontFixed  = (1 << 2),
+  };
+
+  Type type;
+
+  union {
+    Unicode unichar;
+    char   *value;
+    Guint   flags;
+    OpColor color;
+  };
+
+  MCOp(const MCOp &op): type(op.type) {
+    copyData(op);
+  }
+  // Needed because std::vector may assign elements (e.g. on insert or erase):
+  // the implicit operator would leave two MCOps freeing the same font name.
+  MCOp &operator=(const MCOp &op) {
+    if (this != &op) {
+      if (type == FontName)
+        gfree(value);
+      type = op.type;
+      copyData(op);
+    }
+    return *this;
+  }
+  ~MCOp() {
+    if (type == FontName)
+      gfree(value);
+  }
+  MCOp(): type(FontName), value(NULL) {}
+  MCOp(Unicode u): type(Unichar), unichar(u) {}
+  MCOp(const char *s): type(FontName), value(dupString(s)) {}
+  // A FontName built this way has no name: the union must not be left
+  // holding "f" reinterpreted as a pointer, because it would be freed.
+  MCOp(Type t, Guint f = 0): type(t), value(NULL) {
+    if (t != FontName)
+      flags = f;
+  }
+
+private:
+  // Copies a string using the allocator paired with gfree(); NULL is
+  // passed through.
+  static char *dupString(const char *s) {
+    return s ? copyString(s) : NULL;
+  }
+
+  // Copies the union member selected by "type", which has to be already
+  // set to the type of "op".
+  void copyData(const MCOp &op) {
+    switch (type) {
+      case Flags:    flags = op.flags; break;
+      case Unichar:  unichar = op.unichar; break;
+      case Color:    color = op.color; break;
+      case FontName: value = dupString(op.value); break;
+    }
+  }
+};
+
+
+typedef std::vector<MCOp> MCOpArray;
+
+
+// Output device which captures, as a sequence of MCOp operations, the text
+// drawn inside the marked content sequence identified by a given MCID.
+class MCOutputDev: public OutputDev {
+public:
+  MCOutputDev(int mcid);
+  virtual ~MCOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  virtual void restoreState(GfxState *state) { updateFont(state); }
+  virtual void updateFont(GfxState *state);
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  // Operations captured so far, in the order they were drawn.
+  const MCOpArray& getMCOps() const;
+
+private:
+  // Not copyable: each instance owns its private data.
+  MCOutputDev(const MCOutputDev &);
+  MCOutputDev &operator=(const MCOutputDev &);
+
+  struct Priv;
+  Priv *p;
+};
+
+#endif /* !MCOUTPUTDEV_H */
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 7e5f3c66..dfbcb8d6 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -232,6 +232,7 @@ poppler_include_HEADERS = \
 	NameToUnicodeTable.h \
 	PSOutputDev.h \
 	TextOutputDev.h \
+	MCOutputDev.h \
 	SecurityHandler.h \
 	UTF.h \
 	UTF8.h \
@@ -306,6 +307,7 @@ libpoppler_la_SOURCES = \
 	XRef.cc \
 	PSOutputDev.cc \
 	TextOutputDev.cc \
+	MCOutputDev.cc \
 	PageLabelInfo.h \
 	PageLabelInfo.cc \
 	SecurityHandler.cc \
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 498f0751..894114ef 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@
 
 #include "StructElement.h"
 #include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
 #include "PDFDoc.h"
 #include "Dict.h"
 
@@ -976,6 +978,78 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
   return NULL;
 }
 
+GooString *StructElement::getText(GooString *string, GBool recursive) const
+{
+  if (isContent() && !isObjectRef()) {
+    const MCOpArray& ops(getMCOps());
+    if (ops.empty())
+      return NULL;
+
+    UnicodeMap *map = globalParams->getTextEncoding();
+    if (!map) {
+      GooString mapName("UTF-8");
+      map = UnicodeMap::parse(&mapName);
+    }
+    // Without an output encoding no text can be produced. This is checked
+    // explicitly because assert() is compiled out when NDEBUG is defined.
+    if (!map)
+      return NULL;
+
+    if (!string)
+      string = new GooString();
+
+    char buf[9];
+    int n;
+
+    for (MCOpArray::const_iterator i = ops.begin(); i != ops.end(); ++i) {
+      if (i->type == MCOp::Unichar) {
+        n = map->mapUnicode(i->unichar, buf, sizeof(buf));
+        string->append(buf, n);
+      }
+    }
+    map->decRefCnt();
+    return string;
+  }
+
+  if (!recursive)
+    return NULL;
+
+  // Do a depth-first traversal, to get elements in logical order
+  if (!string)
+    string = new GooString();
+
+  for (unsigned i = 0; i < getNumElements(); i++)
+    getElement(i)->getText(string, recursive);
+
+  return string;
+}
+
+// Renders the page(s) of the element through an MCOutputDev to capture
+// the operations of the marked content sequence the element refers to.
+const MCOpArray StructElement::getMCOps() const
+{
+  // Object references do not have an MCID to look for.
+  if (!isContent() || isObjectRef())
+    return MCOpArray(); // Empty array
+
+  MCOutputDev mcdev(getMCID());
+  int startPage = 0, endPage = 0;
+
+  Ref ref;
+  if (getPageRef(ref)) {
+    startPage = endPage = treeRoot->getDoc()->findPage(ref.num, ref.gen);
+  }
+
+  // Page unknown: fall back to scanning all the pages of the document.
+  if (!(startPage && endPage)) {
+    startPage = 1;
+    endPage = treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&mcdev, startPage, endPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getMCOps();
+}
+
 static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
 {
   // Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 977e4451..2db11fdf 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@
 
 #include "goo/gtypes.h"
 #include "goo/GooString.h"
+#include "MCOutputDev.h"
 #include "Object.h"
 #include <vector>
 #include <set>
@@ -219,6 +220,27 @@ public:
   const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
   GooString *getActualText() { return isContent() ? NULL : s->actualText; }
 
+  // Content text referenced by the element:
+  //
+  // - For MCID reference elements, this is just the text of the
+  //   corresponding marked content object in the page stream, regardless
+  //   of the setting of the "recursive" flag. NULL is returned when the
+  //   marked content object has no text.
+  // - For other elements, if the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree
+  //   is returned. The text is assembled by traversing the leaf MCID
+  //   reference elements in logical order.
+  // - In any other case, the function returns NULL.
+  //
+  // The text will be appended to the passed GooString. If NULL is passed,
+  // a new string is returned, and the ownership passed to the caller.
+  //
+  GooString *getText(GooString *string = NULL, GBool recursive = gTrue) const;
+
+  // Operations captured from the marked content object of an MCID
+  // reference element; the array is empty for any other element.
+  const MCOpArray getMCOps() const;
+
   ~StructElement();
 
 private:
-- 
cgit v1.2.3