diff options
author | Adrian Johnson <ajohnson@redneon.com> | 2023-10-02 12:25:22 +1030 |
---|---|---|
committer | Adrian Johnson <ajohnson@redneon.com> | 2023-10-08 21:41:17 +1030 |
commit | b4052d02b44e8e412316a8dca1a99f9714e5aa8e (patch) | |
tree | e32d9d307b0b14fe62ed74983fbe2530953928d5 | |
parent | 48914b5d5fc12ae96d4f3ac3fc9c6fd08a1d6496 (diff) |
cairo: write document logical structure if output is pdf
Cairo 1.18 can create a tagged pdf. Add support to CairoOutputDev to
copy the logical structure from the input pdf if available.
Added CairoOutputDev::setLogicalStructure() to enable this behaviour.
Added the -struct option to pdftocairo to enable it from the command line.
-rw-r--r-- | poppler/Annot.h | 2 | ||||
-rw-r--r-- | poppler/CairoOutputDev.cc | 472 | ||||
-rw-r--r-- | poppler/CairoOutputDev.h | 50 | ||||
-rw-r--r-- | poppler/Gfx.cc | 4 | ||||
-rw-r--r-- | poppler/MarkedContentOutputDev.cc | 4 | ||||
-rw-r--r-- | poppler/MarkedContentOutputDev.h | 4 | ||||
-rw-r--r-- | poppler/OutputDev.h | 4 | ||||
-rw-r--r-- | poppler/Page.cc | 9 | ||||
-rw-r--r-- | poppler/Page.h | 5 | ||||
-rw-r--r-- | poppler/StructElement.cc | 9 | ||||
-rw-r--r-- | poppler/StructElement.h | 5 | ||||
-rw-r--r-- | poppler/UTF.cc | 32 | ||||
-rw-r--r-- | poppler/UTF.h | 5 | ||||
-rw-r--r-- | utils/pdftocairo.1 | 4 | ||||
-rw-r--r-- | utils/pdftocairo.cc | 46 |
15 files changed, 635 insertions, 20 deletions
diff --git a/poppler/Annot.h b/poppler/Annot.h index 819877d4..285b43ab 100644 --- a/poppler/Annot.h +++ b/poppler/Annot.h @@ -1757,7 +1757,7 @@ private: // Annots //------------------------------------------------------------------------ -class Annots +class POPPLER_PRIVATE_EXPORT Annots { public: // Build a list of Annot objects and call setPage on them diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc index beb92c25..986d7a14 100644 --- a/poppler/CairoOutputDev.cc +++ b/poppler/CairoOutputDev.cc @@ -69,6 +69,7 @@ #include "CairoFontEngine.h" #include "CairoRescaleBox.h" #include "UnicodeMap.h" +#include "UTF.h" #include "JBIG2Stream.h" //------------------------------------------------------------------------ @@ -170,6 +171,9 @@ CairoOutputDev::CairoOutputDev() textPage = nullptr; actualText = nullptr; + logicalStruct = false; + pdfPageNum = 0; + cairoPageNum = 0; // the SA parameter supposedly defaults to false, but Acrobat // apparently hardwires it to true @@ -177,6 +181,7 @@ CairoOutputDev::CairoOutputDev() align_stroke_coords = false; adjusted_stroke_width = false; xref = nullptr; + currentStructParents = -1; } CairoOutputDev::~CairoOutputDev() @@ -232,6 +237,14 @@ void CairoOutputDev::setCairo(cairo_t *c) } } +bool CairoOutputDev::isPDF() +{ + if (cairo) { + return cairo_surface_get_type(cairo_get_target(cairo)) == CAIRO_SURFACE_TYPE_PDF; + } + return false; +} + void CairoOutputDev::setTextPage(TextPage *text) { if (textPage) { @@ -273,10 +286,66 @@ void CairoOutputDev::startDoc(PDFDoc *docA, CairoFontEngine *parentFontEngine) fontEngine_owner = true; } xref = doc->getXRef(); + + mcidEmitted.clear(); + destsMap.clear(); + emittedDestinations.clear(); + pdfPageToCairoPageMap.clear(); + pdfPageRefToCairoPageNumMap.clear(); + cairoPageNum = 0; + firstPage = true; +} + +void CairoOutputDev::textStringToQuotedUtf8(const GooString *text, GooString *s) +{ + std::string utf8 = TextStringToUtf8(text->toStr()); + s->Set("'"); + for (char c : utf8) 
{ + if (c == '\\' || c == '\'') { + s->append("\\"); + } + s->append(c); + } + s->append("'"); +} + +// Initialization that needs to be performed after setCairo() is called. +void CairoOutputDev::startFirstPage(int pageNum, GfxState *state, XRef *xrefA) +{ + if (xrefA) { + xref = xrefA; + } + + if (logicalStruct && isPDF()) { + int numDests = doc->getCatalog()->numDestNameTree(); + for (int i = 0; i < numDests; i++) { + const GooString *name = doc->getCatalog()->getDestNameTreeName(i); + std::unique_ptr<LinkDest> dest = doc->getCatalog()->getDestNameTreeDest(i); + if (dest->isPageRef()) { + Ref ref = dest->getPageRef(); + destsMap[ref].insert({ std::string(name->toStr()), std::move(dest) }); + } + } + + numDests = doc->getCatalog()->numDests(); + for (int i = 0; i < numDests; i++) { + const char *name = doc->getCatalog()->getDestsName(i); + std::unique_ptr<LinkDest> dest = doc->getCatalog()->getDestsDest(i); + if (dest->isPageRef()) { + Ref ref = dest->getPageRef(); + destsMap[ref].insert({ std::string(name), std::move(dest) }); + } + } + } } void CairoOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA) { + if (firstPage) { + startFirstPage(pageNum, state, xrefA); + firstPage = false; + } + /* set up some per page defaults */ cairo_pattern_destroy(fill_pattern); cairo_pattern_destroy(stroke_pattern); @@ -289,8 +358,52 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA) if (textPage) { textPage->startPage(state); } - if (xrefA != nullptr) { - xref = xrefA; + + pdfPageNum = pageNum; + cairoPageNum++; + pdfPageToCairoPageMap[pdfPageNum] = cairoPageNum; + + if (logicalStruct && isPDF()) { + Object obj = doc->getPage(pageNum)->getAnnotsObject(xref); + Annots *annots = new Annots(doc, pageNum, &obj); + + for (Annot *annot : annots->getAnnots()) { + if (annot->getType() == Annot::typeLink) { + annot->incRefCnt(); + annotations.push_back(annot); + } + } + + delete annots; + + // emit dests + Ref *ref = 
doc->getCatalog()->getPageRef(pageNum); + pdfPageRefToCairoPageNumMap[*ref] = cairoPageNum; + auto pageDests = destsMap.find(*ref); + if (pageDests != destsMap.end()) { + for (auto &it : pageDests->second) { + GooString quoted_name; + GooString name(it.first); + textStringToQuotedUtf8(&name, &quoted_name); + emittedDestinations.insert(quoted_name.toStr()); + + GooString attrib; + attrib.appendf("name={0:t} ", &quoted_name); + if (it.second->getChangeLeft()) { + attrib.appendf("x={0:g} ", it.second->getLeft()); + } + if (it.second->getChangeTop()) { + attrib.appendf("y={0:g} ", state->getPageHeight() - it.second->getTop()); + } + +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + cairo_tag_begin(cairo, CAIRO_TAG_DEST, attrib.c_str()); + cairo_tag_end(cairo, CAIRO_TAG_DEST); +#endif + } + } + + currentStructParents = doc->getPage(pageNum)->getStructParents(); } } @@ -302,6 +415,309 @@ void CairoOutputDev::endPage() } } +void CairoOutputDev::beginForm(Object *obj, Ref id) +{ + if (logicalStruct && isPDF()) { + structParentsStack.push_back(currentStructParents); + + const Object tmp = obj->streamGetDict()->lookup("StructParents"); + if (!(tmp.isInt() || tmp.isNull())) { + error(errSyntaxError, -1, "XObject StructParents object is wrong type ({0:s})", tmp.getTypeName()); + } else if (tmp.isInt()) { + currentStructParents = tmp.getInt(); + } + } +} + +void CairoOutputDev::endForm(Object *obj, Ref id) +{ + if (logicalStruct && isPDF()) { + currentStructParents = structParentsStack.back(); + structParentsStack.pop_back(); + } +} + +void CairoOutputDev::quadToCairoRect(AnnotQuadrilaterals *quads, int idx, double pageHeight, cairo_rectangle_t *rect) +{ + double x1, x2, y1, y2; + x1 = x2 = quads->getX1(idx); + y1 = y2 = quads->getX2(idx); + + x1 = std::min(x1, quads->getX2(idx)); + x1 = std::min(x1, quads->getX3(idx)); + x1 = std::min(x1, quads->getX4(idx)); + + y1 = std::min(y1, quads->getY2(idx)); + y1 = std::min(y1, quads->getY3(idx)); + y1 = std::min(y1, quads->getY4(idx)); +
+ x2 = std::max(x2, quads->getX2(idx)); + x2 = std::max(x2, quads->getX3(idx)); + x2 = std::max(x2, quads->getX4(idx)); + + y2 = std::max(y2, quads->getY2(idx)); + y2 = std::max(y2, quads->getY3(idx)); + y2 = std::max(y2, quads->getY4(idx)); + + rect->x = x1; + rect->y = pageHeight - y2; + rect->width = x2 - x1; + rect->height = y2 - y1; +} + +bool CairoOutputDev::appendLinkDestRef(GooString *s, const LinkDest *dest) +{ + Ref ref = dest->getPageRef(); + auto pageNum = pdfPageRefToCairoPageNumMap.find(ref); + if (pageNum != pdfPageRefToCairoPageNumMap.end()) { + auto cairoPage = pdfPageToCairoPageMap.find(pageNum->second); + if (cairoPage != pdfPageToCairoPageMap.end()) { + s->appendf("page={0:d} ", cairoPage->second); + double destPageHeight = doc->getPageMediaHeight(dest->getPageNum()); + appendLinkDestXY(s, dest, destPageHeight); + return true; + } + } + return false; +} + +void CairoOutputDev::appendLinkDestXY(GooString *s, const LinkDest *dest, double destPageHeight) +{ + double x = 0; + double y = 0; + + if (dest->getChangeLeft()) { + x = dest->getLeft(); + } + + if (dest->getChangeTop()) { + y = dest->getTop(); + } + + // if pageHeight is 0, dest is remote document, cairo uses PDF coords in this + // case. So don't flip coords when pageHeight is 0. + s->appendf("pos=[{0:g} {1:g}] ", x, destPageHeight ? 
destPageHeight - y : y); +} + +bool CairoOutputDev::beginLinkTag(AnnotLink *annotLink) +{ + int page_num = annotLink->getPageNum(); + double height = doc->getPageMediaHeight(page_num); + + GooString attrib; + attrib.appendf("link_page={0:d} ", page_num); + attrib.append("rect=["); + AnnotQuadrilaterals *quads = annotLink->getQuadrilaterals(); + if (quads && quads->getQuadrilateralsLength() > 0) { + for (int i = 0; i < quads->getQuadrilateralsLength(); i++) { + cairo_rectangle_t rect; + quadToCairoRect(quads, i, height, &rect); + attrib.appendf("{0:g} {1:g} {2:g} {3:g} ", rect.x, rect.y, rect.width, rect.height); + } + } else { + double x1, x2, y1, y2; + annotLink->getRect(&x1, &y1, &x2, &y2); + attrib.appendf("{0:g} {1:g} {2:g} {3:g} ", x1, height - y2, x2 - x1, y2 - y1); + } + attrib.append("] "); + + LinkAction *action = annotLink->getAction(); + if (action->getKind() == actionGoTo) { + LinkGoTo *act = static_cast<LinkGoTo *>(action); + if (act->isOk()) { + const GooString *namedDest = act->getNamedDest(); + const LinkDest *linkDest = act->getDest(); + if (namedDest) { + GooString name; + textStringToQuotedUtf8(namedDest, &name); + if (emittedDestinations.count(name.toStr()) == 0) { + return false; + } + attrib.appendf("dest={0:t} ", &name); + } else if (linkDest && linkDest->isOk() && linkDest->isPageRef()) { + bool ok = appendLinkDestRef(&attrib, linkDest); + if (!ok) { + return false; + } + } + } + } else if (action->getKind() == actionGoToR) { + LinkGoToR *act = static_cast<LinkGoToR *>(action); + attrib.appendf("file='{0:t}' ", act->getFileName()); + const GooString *namedDest = act->getNamedDest(); + const LinkDest *linkDest = act->getDest(); + if (namedDest) { + GooString name; + textStringToQuotedUtf8(namedDest, &name); + if (emittedDestinations.count(name.toStr()) == 0) { + return false; + } + attrib.appendf("dest={0:t} ", &name); + } else if (linkDest && linkDest->isOk() && !linkDest->isPageRef()) { + auto cairoPage = 
pdfPageToCairoPageMap.find(linkDest->getPageNum()); + if (cairoPage != pdfPageToCairoPageMap.end()) { + attrib.appendf("page={0:d} ", cairoPage->second); + appendLinkDestXY(&attrib, linkDest, 0.0); + } else { + return false; + } + } + } else if (action->getKind() == actionURI) { + LinkURI *act = static_cast<LinkURI *>(action); + if (act->isOk()) { + attrib.appendf("uri='{0:s}'", act->getURI().c_str()); + } + } +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + cairo_tag_begin(cairo, CAIRO_TAG_LINK, attrib.c_str()); +#endif + return true; +} + +AnnotLink *CairoOutputDev::findLinkObject(const StructElement *elem) +{ + if (elem->isObjectRef()) { + Ref ref = elem->getObjectRef(); + for (Annot *annot : annotations) { + if (annot->getType() == Annot::typeLink && annot->match(&ref)) { + return static_cast<AnnotLink *>(annot); + } + } + } + + for (unsigned i = 0; i < elem->getNumChildren(); i++) { + AnnotLink *link = findLinkObject(elem->getChild(i)); + if (link) { + return link; + } + } + + return nullptr; +} + +bool CairoOutputDev::beginLink(const StructElement *linkElem) +{ + bool emitted = true; + AnnotLink *linkAnnot = findLinkObject(linkElem); + if (linkAnnot) { + emitted = beginLinkTag(linkAnnot); + } else { +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + cairo_tag_begin(cairo, linkElem->getTypeName(), nullptr); +#endif + } + return emitted; +} + +void CairoOutputDev::getStructElemAttributeString(const StructElement *elem) +{ + int mcid = 0; + GooString attribs; + Ref ref = elem->getObjectRef(); + attribs.appendf("id='{0:d}_{1:d}_{2:d}'", ref.num, ref.gen, mcid); + attribs.appendf(" parent='{0:d}_{1:d}'", ref.num, ref.gen); +} + +int CairoOutputDev::getContentElementStructParents(const StructElement *element) +{ + int structParents = -1; + Ref ref; + + if (element->hasStmRef()) { + element->getStmRef(ref); + Object xobjectObj = xref->fetch(ref); + const Object &spObj = xobjectObj.streamGetDict()->lookup("StructParents"); + if (spObj.isInt()) { + 
structParents = spObj.getInt(); + } + } else if (element->hasPageRef()) { + element->getPageRef(ref); + Object pageObj = xref->fetch(ref); + const Object &spObj = pageObj.dictLookup("StructParents"); + if (spObj.isInt()) { + structParents = spObj.getInt(); + } + } + + if (structParents == -1) { + error(errSyntaxError, -1, "Unable to find StructParents object for StructElement"); + } + return structParents; +} + +bool CairoOutputDev::checkIfStructElementNeeded(const StructElement *element) +{ + if (element->isContent() && !element->isObjectRef()) { + int structParents = getContentElementStructParents(element); + int mcid = element->getMCID(); + if (mcidEmitted.count(std::pair(structParents, mcid)) > 0) { + structElementNeeded.insert(element); + return true; + } + } else if (!element->isContent()) { + bool needed = false; + for (unsigned i = 0; i < element->getNumChildren(); i++) { + if (checkIfStructElementNeeded(element->getChild(i))) { + needed = true; + } + } + if (needed) { + structElementNeeded.insert(element); + } + return needed; + } + return false; +} + +void CairoOutputDev::emitStructElement(const StructElement *element) +{ + if (structElementNeeded.count(element) == 0) { + return; + } + +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + if (element->isContent() && !element->isObjectRef()) { + int structParents = getContentElementStructParents(element); + int mcid = element->getMCID(); + GooString attribs; + attribs.appendf("ref='{0:d}_{1:d}'", structParents, mcid); + cairo_tag_begin(cairo, CAIRO_TAG_CONTENT_REF, attribs.c_str()); + cairo_tag_end(cairo, CAIRO_TAG_CONTENT_REF); + } else if (!element->isContent()) { + if (element->getType() == StructElement::Link) { + bool ok = beginLink(element); + if (!ok) { + return; + } + } else { + cairo_tag_begin(cairo, element->getTypeName(), ""); + } + for (unsigned i = 0; i < element->getNumChildren(); i++) { + emitStructElement(element->getChild(i)); + } + cairo_tag_end(cairo, element->getTypeName()); + } 
+#endif +} + +void CairoOutputDev::emitStructTree() +{ + if (logicalStruct && isPDF()) { + const StructTreeRoot *root = doc->getStructTreeRoot(); + if (!root) { + return; + } + + for (unsigned i = 0; i < root->getNumChildren(); i++) { + checkIfStructElementNeeded(root->getChild(i)); + } + + for (unsigned i = 0; i < root->getNumChildren(); i++) { + emitStructElement(root->getChild(i)); + } + } +} + void CairoOutputDev::startType3Render(GfxState *state, XRef *xrefA) { /* When cairo calls a user font render function, the default @@ -3468,6 +3884,58 @@ void CairoOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, int wi cairo_pattern_destroy(pattern); } +void CairoOutputDev::beginMarkedContent(const char *name, Dict *properties) +{ + if (!logicalStruct || !isPDF()) { + return; + } + + if (strcmp(name, "Artifact") == 0) { + markedContentStack.emplace_back(name); +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + cairo_tag_begin(cairo, name, nullptr); +#endif + return; + } + + int mcid = -1; + if (properties) { + properties->lookupInt("MCID", nullptr, &mcid); + } + + if (mcid == -1) { + return; + } + + GooString attribs; + attribs.appendf("tag_name='{0:s}' id='{1:d}_{2:d}'", name, currentStructParents, mcid); + mcidEmitted.insert(std::pair<int, int>(currentStructParents, mcid)); + + std::string tag; +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + tag = CAIRO_TAG_CONTENT; + cairo_tag_begin(cairo, CAIRO_TAG_CONTENT, attribs.c_str()); +#endif + + markedContentStack.push_back(tag); +} + +void CairoOutputDev::endMarkedContent(GfxState *state) +{ + if (!logicalStruct || !isPDF()) { + return; + } + + if (markedContentStack.size() == 0) { + return; + } + +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + cairo_tag_end(cairo, markedContentStack.back().c_str()); +#endif + markedContentStack.pop_back(); +} + //------------------------------------------------------------------------ // ImageOutputDev 
//------------------------------------------------------------------------ diff --git a/poppler/CairoOutputDev.h b/poppler/CairoOutputDev.h index c4c5f8d0..a955115e 100644 --- a/poppler/CairoOutputDev.h +++ b/poppler/CairoOutputDev.h @@ -37,10 +37,16 @@ #ifndef CAIROOUTPUTDEV_H #define CAIROOUTPUTDEV_H +#include <unordered_set> + #include <cairo-ft.h> #include "OutputDev.h" #include "TextOutputDev.h" #include "GfxState.h" +#include "StructElement.h" +#include "StructTreeRoot.h" +#include "Annot.h" +#include "Link.h" class PDFDoc; class GfxState; @@ -143,6 +149,12 @@ public: // End a page. void endPage() override; + // Must be called before last call to endPage() + void emitStructTree(); + + void beginForm(Object *obj, Ref id) override; + void endForm(Object *obj, Ref id) override; + //----- save/restore graphics state void saveState(GfxState *state) override; void restoreState(GfxState *state) override; @@ -201,6 +213,9 @@ public: void beginTextObject(GfxState *state) override; void endTextObject(GfxState *state) override; + void beginMarkedContent(const char *name, Dict *properties) override; + void endMarkedContent(GfxState *state) override; + //----- image drawing void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool interpolate, bool inlineImg) override; void setSoftMaskFromImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool inlineImg, double *baseMatrix) override; @@ -244,6 +259,7 @@ public: needFontUpdate = true; } void copyAntialias(cairo_t *cr, cairo_t *source_cr); + void setLogicalStructure(bool logStruct) { this->logicalStruct = logStruct; } enum Type3RenderType { @@ -270,12 +286,25 @@ protected: void setMimeData(GfxState *state, Stream *str, Object *ref, GfxImageColorMap *colorMap, cairo_surface_t *image, int height); void fillToStrokePathClip(GfxState *state); void alignStrokeCoords(const GfxSubpath *subpath, int i, double *x, double *y); + AnnotLink 
*findLinkObject(const StructElement *elem); + void quadToCairoRect(AnnotQuadrilaterals *quads, int idx, double destPageHeight, cairo_rectangle_t *rect); + bool appendLinkDestRef(GooString *s, const LinkDest *dest); + void appendLinkDestXY(GooString *s, const LinkDest *dest, double destPageHeight); + bool beginLinkTag(AnnotLink *annotLink); + bool beginLink(const StructElement *linkElem); + void getStructElemAttributeString(const StructElement *elem); + int getContentElementStructParents(const StructElement *element); + bool checkIfStructElementNeeded(const StructElement *element); + void emitStructElement(const StructElement *elem); + void startFirstPage(int pageNum, GfxState *state, XRef *xrefA); #if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 14, 0) bool setMimeDataForJBIG2Globals(Stream *str, cairo_surface_t *image); #endif #if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 10) bool setMimeDataForCCITTParams(Stream *str, cairo_surface_t *image, int height); #endif + static void textStringToQuotedUtf8(const GooString *text, GooString *s); + bool isPDF(); std::optional<GfxRGB> fill_color, stroke_color; cairo_pattern_t *fill_pattern, *stroke_pattern; @@ -331,6 +360,14 @@ protected: bool has_color; double t3_glyph_bbox[4]; bool prescaleImages; + bool logicalStruct; + bool firstPage; + int pdfPageNum; // page number of the PDF file + int cairoPageNum; // page number in cairo output + std::vector<std::string> markedContentStack; + std::vector<Annot *> annotations; + std::set<std::string> emittedDestinations; + std::map<int, int> pdfPageToCairoPageMap; TextPage *textPage; // text for the current page ActualText *actualText; @@ -361,6 +398,19 @@ protected: Ref fontRef; }; std::vector<SaveStateElement> saveStateStack; + + std::map<Ref, std::map<std::string, std::unique_ptr<LinkDest>>> destsMap; + std::map<Ref, int> pdfPageRefToCairoPageNumMap; + std::vector<int> structParentsStack; + int currentStructParents; + + struct StructParentsMcidHash + { + size_t 
operator()(std::pair<int, int> x) const { return x.first << 16 | x.second; } + }; + std::unordered_set<std::pair<int, int>, StructParentsMcidHash> mcidEmitted; // <structParent, MCID> + + std::unordered_set<const StructElement *> structElementNeeded; }; //------------------------------------------------------------------------ diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc index 00662256..3fa56aa4 100644 --- a/poppler/Gfx.cc +++ b/poppler/Gfx.cc @@ -4135,9 +4135,9 @@ void Gfx::opXObject(Object args[], int numArgs) out->drawForm(refObj.getRef()); } else { Ref ref = refObj.isRef() ? refObj.getRef() : Ref::INVALID(); - out->beginForm(ref); + out->beginForm(&obj1, ref); doForm(&obj1); - out->endForm(ref); + out->endForm(&obj1, ref); } } if (refObj.isRef() && shouldDoForm) { diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc index e3a4cdce..63b17446 100644 --- a/poppler/MarkedContentOutputDev.cc +++ b/poppler/MarkedContentOutputDev.cc @@ -56,12 +56,12 @@ void MarkedContentOutputDev::endPage() pageWidth = pageHeight = 0.0; } -void MarkedContentOutputDev::beginForm(Ref id) +void MarkedContentOutputDev::beginForm(Object * /* obj */, Ref id) { formStack.push_back(id); } -void MarkedContentOutputDev::endForm(Ref id) +void MarkedContentOutputDev::endForm(Object * /* obj */, Ref id) { formStack.pop_back(); } diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h index dad39a77..d6584b70 100644 --- a/poppler/MarkedContentOutputDev.h +++ b/poppler/MarkedContentOutputDev.h @@ -100,8 +100,8 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; void endPage() override; - void beginForm(Ref id) override; - void endForm(Ref id) override; + void beginForm(Object * /* obj */, Ref id) override; + void endForm(Object * /* obj */, Ref id) override; void drawChar(GfxState *state, double xx, double yy, double dx, double dy, double ox, double oy, CharCode c, int nBytes, const Unicode *u, int uLen) override; diff 
--git a/poppler/OutputDev.h b/poppler/OutputDev.h index 67c21618..02438ab0 100644 --- a/poppler/OutputDev.h +++ b/poppler/OutputDev.h @@ -320,9 +320,9 @@ public: virtual void type3D1(GfxState * /*state*/, double /*wx*/, double /*wy*/, double /*llx*/, double /*lly*/, double /*urx*/, double /*ury*/) { } //----- form XObjects - virtual void beginForm(Ref /*id*/) { } + virtual void beginForm(Object * /* obj */, Ref /*id*/) { } virtual void drawForm(Ref /*id*/) { } - virtual void endForm(Ref /*id*/) { } + virtual void endForm(Object * /* obj */, Ref /*id*/) { } //----- PostScript XObjects virtual void psXObject(Stream * /*psStream*/, Stream * /*level1Stream*/) { } diff --git a/poppler/Page.cc b/poppler/Page.cc index 9d5a4ffb..99639f3d 100644 --- a/poppler/Page.cc +++ b/poppler/Page.cc @@ -259,6 +259,7 @@ Page::Page(PDFDoc *docA, int numA, Object &&pageDict, Ref pageRefA, PageAttrs *a num = numA; duration = -1; annots = nullptr; + structParents = -1; pageObj = std::move(pageDict); @@ -281,6 +282,14 @@ Page::Page(PDFDoc *docA, int numA, Object &&pageDict, Ref pageRefA, PageAttrs *a duration = tmp.getNum(); } + // structParents + const Object &tmp2 = pageObj.dictLookup("StructParents"); + if (!(tmp2.isInt() || tmp2.isNull())) { + error(errSyntaxError, -1, "Page StructParents object (page {0:d}) is wrong type ({1:s})", num, tmp2.getTypeName()); + } else if (tmp2.isInt()) { + structParents = tmp2.getInt(); + } + // annotations annotsObj = pageObj.dictLookupNF("Annots").copy(); if (!(annotsObj.isRef() || annotsObj.isArray() || annotsObj.isNull())) { diff --git a/poppler/Page.h b/poppler/Page.h index 3fe86be7..fdd4c260 100644 --- a/poppler/Page.h +++ b/poppler/Page.h @@ -242,6 +242,10 @@ public: bool hasStandaloneFields() const { return !standaloneFields.empty(); } + // Get the integer key of the page's entry in the structural parent tree. + // Returns -1 if the page dict does not contain a StructParents key. 
+ int getStructParents() const { return structParents; } + private: // replace xref void replaceXRef(XRef *xrefA); @@ -259,6 +263,7 @@ private: Object trans; // page transition Object actions; // page additional actions double duration; // page duration + int structParents; // integer key of page in structure parent tree bool ok; // true if page is valid mutable std::recursive_mutex mutex; // standalone widgets are special FormWidget's inside a Page that *are not* diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc index 9b722c19..35babbba 100644 --- a/poppler/StructElement.cc +++ b/poppler/StructElement.cc @@ -779,6 +779,15 @@ bool StructElement::getPageRef(Ref &ref) const return false; } +bool StructElement::getStmRef(Ref &ref) const +{ + if (stmRef.isRef()) { + ref = stmRef.getRef(); + return true; + } + return false; +} + const char *StructElement::getTypeName() const { return typeToName(type); diff --git a/poppler/StructElement.h b/poppler/StructElement.h index 2244fbe2..d083735d 100644 --- a/poppler/StructElement.h +++ b/poppler/StructElement.h @@ -242,9 +242,12 @@ public: int getMCID() const { return c->mcid; } Ref getObjectRef() const { return c->ref; } - Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef; } + Ref getParentRef() const { return isContent() ? parent->getParentRef() : s->parentRef; } + StructElement *getParent() const { return parent; } // returns NULL if parent is StructTreeRoot bool hasPageRef() const; bool getPageRef(Ref &ref) const; + bool hasStmRef() const { return stmRef.isRef(); } + bool getStmRef(Ref &ref) const; StructTreeRoot *getStructTreeRoot() { return treeRoot; } // Optional element identifier. 
diff --git a/poppler/UTF.cc b/poppler/UTF.cc index cce313ef..9216e61f 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -561,3 +561,35 @@ void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_le *indices = idx; } } + +// Convert a PDF Text String to UTF-8 +// textStr - PDF text string +// returns UTF-8 string. +std::string TextStringToUtf8(const std::string &textStr) +{ + int i, len; + const char *s; + char *utf8; + + len = textStr.size(); + s = textStr.c_str(); + if (GooString::hasUnicodeMarker(textStr)) { + uint16_t *utf16; + len = len / 2 - 1; + utf16 = new uint16_t[len]; + for (i = 0; i < len; i++) { + utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff); + } + utf8 = utf16ToUtf8(utf16, &len); + delete[] utf16; + } else { + utf8 = (char *)gmalloc(len + 1); + for (i = 0; i < len; i++) { + utf8[i] = pdfDocEncoding[s[i] & 0xff]; + } + utf8[i] = 0; + } + std::string utf8_string(utf8); + gfree(utf8); + return utf8_string; +} diff --git a/poppler/UTF.h b/poppler/UTF.h index bd04d05e..598c2e45 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -125,4 +125,9 @@ char POPPLER_PRIVATE_EXPORT *utf16ToUtf8(const uint16_t *utf16, int *len = nullp // being passed in @in_idx parameter). void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices); +// Convert a PDF Text String to UTF-8 +// textStr - PDF text string +// returns UTF-8 string. +std::string POPPLER_PRIVATE_EXPORT TextStringToUtf8(const std::string &textStr); + #endif diff --git a/utils/pdftocairo.1 b/utils/pdftocairo.1 index 8de23925..782f81fd 100644 --- a/utils/pdftocairo.1 +++ b/utils/pdftocairo.1 @@ -211,6 +211,10 @@ Generate Level 2 PostScript (PS only). Generate Level 3 PostScript (PS only). This enables all Level 2 features plus shading patterns and masked images. This is the default setting. 
.TP +.B \-struct +If the input file contains structural information about the document's content, +write this information to the output file (PDF only). +.TP .B \-origpagesizes This option is the same as "\-paper match". .TP diff --git a/utils/pdftocairo.cc b/utils/pdftocairo.cc index faa5a02d..f16f03e4 100644 --- a/utils/pdftocairo.cc +++ b/utils/pdftocairo.cc @@ -132,6 +132,7 @@ static bool noShrink = false; static bool noCenter = false; static bool duplex = false; static char tiffCompressionStr[16] = ""; +static bool docStruct = false; static char ownerPassword[33] = ""; static char userPassword[33] = ""; @@ -220,6 +221,10 @@ static const ArgDesc argDesc[] = { { "-nocenter", argFlag, &noCenter, 0, "don't center pages smaller than the paper size" }, { "-duplex", argFlag, &duplex, 0, "enable duplex printing" }, +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0) + { "-struct", argFlag, &docStruct, 0, "enable logical document structure" }, +#endif + { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" }, { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" }, @@ -725,11 +730,25 @@ static void renderPage(PDFDoc *doc, CairoOutputDev *cairoOut, int pg, double pag cairo_destroy(cr); } -static void endPage(GooString *imageFileName) +static void endPage(GooString *imageFileName, CairoOutputDev *cairoOut, bool isLastPage) { cairo_status_t status; + cairo_t *cr; if (printing) { + if (isLastPage) { + cr = cairo_create(surface); + cairoOut->setCairo(cr); + cairoOut->setPrinting(printing); + cairoOut->emitStructTree(); + cairoOut->setCairo(nullptr); + status = cairo_status(cr); + if (status) { + fprintf(stderr, "cairo error: %s\n", cairo_status_to_string(status)); + } + cairo_destroy(cr); + } + cairo_surface_show_page(surface); #ifdef CAIRO_HAS_WIN32_SURFACE @@ -907,7 +926,6 @@ int main(int argc, char *argv[]) int pg, pg_num_len; double pg_w, pg_h, tmp, output_w, output_h; int num_outputs; 
- bool documentInitialized = false; // parse args Win32Console win32Console(&argc, &argv); @@ -1026,6 +1044,10 @@ int main(int argc, char *argv[]) level3 = true; } + if (docStruct && !pdf) { + fprintf(stderr, "Error: -struct may only be used with pdf or output.\n"); + exit(99); + } if (eps && (origPageSizes || paperSize[0] || paperWidth > 0 || paperHeight > 0)) { fprintf(stderr, "Error: page size options may not be used with eps output.\n"); exit(99); @@ -1145,8 +1167,15 @@ int main(int argc, char *argv[]) // If our page range selection and document size indicate we're only // outputting a single page, ensure that even/odd page selection doesn't - // filter out that single page. - if (firstPage == lastPage && ((printOnlyEven && firstPage % 2 == 1) || (printOnlyOdd && firstPage % 2 == 0))) { + // filter out that single page. Also adjust first and last page so there are no pages + // skipped at the start or end of the for loop. + if ((printOnlyEven && firstPage % 2 == 1) || (printOnlyOdd && firstPage % 2 == 0)) { + firstPage++; + } + if ((printOnlyEven && lastPage % 2 == 1) || (printOnlyOdd && lastPage % 2 == 0)) { + lastPage--; + } + if (lastPage < firstPage) { fprintf(stderr, "Invalid even/odd page selection, no pages match criteria.\n"); exit(99); } @@ -1174,6 +1203,8 @@ int main(int argc, char *argv[]) #endif cairoOut = new CairoOutputDev(); + cairoOut->setLogicalStructure(docStruct); + #ifdef USE_CMS cairoOut->setDisplayProfile(profile); #endif @@ -1197,7 +1228,7 @@ int main(int argc, char *argv[]) pg_h = doc->getPageMediaHeight(pg); } - if (printing && !documentInitialized) { + if (printing && pg == firstPage) { if (paperWidth < 0 || paperHeight < 0) { paperWidth = (int)ceil(pg_w); paperHeight = (int)ceil(pg_h); @@ -1235,13 +1266,12 @@ int main(int argc, char *argv[]) } getOutputSize(pg_w, pg_h, &output_w, &output_h); - if (!documentInitialized) { + if (pg == firstPage) { beginDocument(fileName, outputFileName, output_w, output_h); - documentInitialized = true; 
} beginPage(&output_w, &output_h); renderPage(doc.get(), cairoOut, pg, pg_w, pg_h, output_w, output_h); - endPage(imageFileName); + endPage(imageFileName, cairoOut, pg == lastPage); } endDocument(); |