summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Adrian Johnson <ajohnson@redneon.com> 2023-10-02 12:25:22 +1030
committer Adrian Johnson <ajohnson@redneon.com> 2023-10-08 21:41:17 +1030
commit b4052d02b44e8e412316a8dca1a99f9714e5aa8e (patch)
tree e32d9d307b0b14fe62ed74983fbe2530953928d5
parent 48914b5d5fc12ae96d4f3ac3fc9c6fd08a1d6496 (diff)
cairo: write document logical structure if output is pdf
Cairo 1.18 can create a tagged pdf. Add support to CairoOutputDev to copy the logical structure from the input pdf if available. Added setLogicalStructure() to enable. Added -struct option to pdftocairo to enable.
-rw-r--r-- poppler/Annot.h | 2
-rw-r--r-- poppler/CairoOutputDev.cc | 472
-rw-r--r-- poppler/CairoOutputDev.h | 50
-rw-r--r-- poppler/Gfx.cc | 4
-rw-r--r-- poppler/MarkedContentOutputDev.cc | 4
-rw-r--r-- poppler/MarkedContentOutputDev.h | 4
-rw-r--r-- poppler/OutputDev.h | 4
-rw-r--r-- poppler/Page.cc | 9
-rw-r--r-- poppler/Page.h | 5
-rw-r--r-- poppler/StructElement.cc | 9
-rw-r--r-- poppler/StructElement.h | 5
-rw-r--r-- poppler/UTF.cc | 32
-rw-r--r-- poppler/UTF.h | 5
-rw-r--r-- utils/pdftocairo.1 | 4
-rw-r--r-- utils/pdftocairo.cc | 46
15 files changed, 635 insertions, 20 deletions
diff --git a/poppler/Annot.h b/poppler/Annot.h
index 819877d4..285b43ab 100644
--- a/poppler/Annot.h
+++ b/poppler/Annot.h
@@ -1757,7 +1757,7 @@ private:
// Annots
//------------------------------------------------------------------------
-class Annots
+class POPPLER_PRIVATE_EXPORT Annots
{
public:
// Build a list of Annot objects and call setPage on them
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index beb92c25..986d7a14 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -69,6 +69,7 @@
#include "CairoFontEngine.h"
#include "CairoRescaleBox.h"
#include "UnicodeMap.h"
+#include "UTF.h"
#include "JBIG2Stream.h"
//------------------------------------------------------------------------
@@ -170,6 +171,9 @@ CairoOutputDev::CairoOutputDev()
textPage = nullptr;
actualText = nullptr;
+ logicalStruct = false;
+ pdfPageNum = 0;
+ cairoPageNum = 0;
// the SA parameter supposedly defaults to false, but Acrobat
// apparently hardwires it to true
@@ -177,6 +181,7 @@ CairoOutputDev::CairoOutputDev()
align_stroke_coords = false;
adjusted_stroke_width = false;
xref = nullptr;
+ currentStructParents = -1;
}
CairoOutputDev::~CairoOutputDev()
@@ -232,6 +237,14 @@ void CairoOutputDev::setCairo(cairo_t *c)
}
}
+// Report whether the cairo context currently set on this device targets
+// a PDF surface. Returns false when no context has been set.
+bool CairoOutputDev::isPDF()
+{
+    if (!cairo) {
+        return false;
+    }
+    cairo_surface_t *target = cairo_get_target(cairo);
+    return cairo_surface_get_type(target) == CAIRO_SURFACE_TYPE_PDF;
+}
+
void CairoOutputDev::setTextPage(TextPage *text)
{
if (textPage) {
@@ -273,10 +286,66 @@ void CairoOutputDev::startDoc(PDFDoc *docA, CairoFontEngine *parentFontEngine)
fontEngine_owner = true;
}
xref = doc->getXRef();
+
+ mcidEmitted.clear();
+ destsMap.clear();
+ emittedDestinations.clear();
+ pdfPageToCairoPageMap.clear();
+ pdfPageRefToCairoPageNumMap.clear();
+ cairoPageNum = 0;
+ firstPage = true;
+}
+
+// Convert the PDF text string 'text' to UTF-8 and store it in 's' wrapped
+// in single quotes. Every backslash and single quote inside the text is
+// preceded by a backslash. Any previous content of 's' is replaced.
+void CairoOutputDev::textStringToQuotedUtf8(const GooString *text, GooString *s)
+{
+    const std::string converted = TextStringToUtf8(text->toStr());
+    s->Set("'");
+    for (std::string::size_type pos = 0; pos < converted.size(); pos++) {
+        const char ch = converted[pos];
+        const bool needsEscape = (ch == '\'' || ch == '\\');
+        if (needsEscape) {
+            s->append("\\");
+        }
+        s->append(ch);
+    }
+    s->append("'");
+}
+
+// Initialization that needs to be performed after setCairo() is called.
+// Records xrefA (when non-null). When logical structure output to a PDF
+// surface is enabled, also indexes every page-reference destination from
+// the catalog's destination name tree and Dests entries by its target
+// page Ref, filling destsMap.
+void CairoOutputDev::startFirstPage(int pageNum, GfxState *state, XRef *xrefA)
+{
+    if (xrefA) {
+        xref = xrefA;
+    }
+
+    if (!logicalStruct || !isPDF()) {
+        return;
+    }
+
+    auto *catalog = doc->getCatalog();
+
+    const int numTreeDests = catalog->numDestNameTree();
+    for (int i = 0; i < numTreeDests; i++) {
+        const GooString *name = catalog->getDestNameTreeName(i);
+        std::unique_ptr<LinkDest> dest = catalog->getDestNameTreeDest(i);
+        // Guard against a null name or destination before dereferencing.
+        if (name && dest && dest->isPageRef()) {
+            const Ref ref = dest->getPageRef();
+            destsMap[ref].insert({ std::string(name->toStr()), std::move(dest) });
+        }
+    }
+
+    const int numDictDests = catalog->numDests();
+    for (int i = 0; i < numDictDests; i++) {
+        const char *name = catalog->getDestsName(i);
+        std::unique_ptr<LinkDest> dest = catalog->getDestsDest(i);
+        // Guard against a null name or destination before dereferencing.
+        if (name && dest && dest->isPageRef()) {
+            const Ref ref = dest->getPageRef();
+            destsMap[ref].insert({ std::string(name), std::move(dest) });
+        }
+    }
+}
void CairoOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA)
{
+ if (firstPage) {
+ startFirstPage(pageNum, state, xrefA);
+ firstPage = false;
+ }
+
/* set up some per page defaults */
cairo_pattern_destroy(fill_pattern);
cairo_pattern_destroy(stroke_pattern);
@@ -289,8 +358,52 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA)
if (textPage) {
textPage->startPage(state);
}
- if (xrefA != nullptr) {
- xref = xrefA;
+
+ pdfPageNum = pageNum;
+ cairoPageNum++;
+ pdfPageToCairoPageMap[pdfPageNum] = cairoPageNum;
+
+ if (logicalStruct && isPDF()) {
+ Object obj = doc->getPage(pageNum)->getAnnotsObject(xref);
+ Annots *annots = new Annots(doc, pageNum, &obj);
+
+ for (Annot *annot : annots->getAnnots()) {
+ if (annot->getType() == Annot::typeLink) {
+ annot->incRefCnt();
+ annotations.push_back(annot);
+ }
+ }
+
+ delete annots;
+
+ // emit dests
+ Ref *ref = doc->getCatalog()->getPageRef(pageNum);
+ pdfPageRefToCairoPageNumMap[*ref] = cairoPageNum;
+ auto pageDests = destsMap.find(*ref);
+ if (pageDests != destsMap.end()) {
+ for (auto &it : pageDests->second) {
+ GooString quoted_name;
+ GooString name(it.first);
+ textStringToQuotedUtf8(&name, &quoted_name);
+ emittedDestinations.insert(quoted_name.toStr());
+
+ GooString attrib;
+ attrib.appendf("name={0:t} ", &quoted_name);
+ if (it.second->getChangeLeft()) {
+ attrib.appendf("x={0:g} ", it.second->getLeft());
+ }
+ if (it.second->getChangeTop()) {
+ attrib.appendf("y={0:g} ", state->getPageHeight() - it.second->getTop());
+ }
+
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+ cairo_tag_begin(cairo, CAIRO_TAG_DEST, attrib.c_str());
+ cairo_tag_end(cairo, CAIRO_TAG_DEST);
+#endif
+ }
+ }
+
+ currentStructParents = doc->getPage(pageNum)->getStructParents();
}
}
@@ -302,6 +415,309 @@ void CairoOutputDev::endPage()
}
}
+// Called when a form XObject starts. Saves the current StructParents key
+// on structParentsStack and, if the form's stream dictionary carries an
+// integer StructParents entry, makes that the current key. Does nothing
+// unless logical structure output to a PDF surface is enabled.
+void CairoOutputDev::beginForm(Object *obj, Ref id)
+{
+    if (!logicalStruct || !isPDF()) {
+        return;
+    }
+
+    structParentsStack.push_back(currentStructParents);
+
+    const Object formStructParents = obj->streamGetDict()->lookup("StructParents");
+    if (formStructParents.isInt()) {
+        currentStructParents = formStructParents.getInt();
+    } else if (!formStructParents.isNull()) {
+        error(errSyntaxError, -1, "XObject StructParents object is wrong type ({0:s})", formStructParents.getTypeName());
+    }
+}
+
+// Called when a form XObject ends: restores the StructParents key that
+// the matching beginForm() pushed. Does nothing unless logical structure
+// output to a PDF surface is enabled.
+void CairoOutputDev::endForm(Object *obj, Ref id)
+{
+    if (!logicalStruct || !isPDF()) {
+        return;
+    }
+
+    const int savedStructParents = structParentsStack.back();
+    structParentsStack.pop_back();
+    currentStructParents = savedStructParents;
+}
+
+// Compute the axis-aligned bounding box of quadrilateral 'idx' of 'quads'
+// and store it in 'rect'. The y axis is flipped using 'pageHeight', so
+// rect->y is measured from the top of the page.
+void CairoOutputDev::quadToCairoRect(AnnotQuadrilaterals *quads, int idx, double pageHeight, cairo_rectangle_t *rect)
+{
+    const double xs[4] = { quads->getX1(idx), quads->getX2(idx), quads->getX3(idx), quads->getX4(idx) };
+    // All four y coordinates take part; the first one must come from
+    // getY1(), not from an x accessor.
+    const double ys[4] = { quads->getY1(idx), quads->getY2(idx), quads->getY3(idx), quads->getY4(idx) };
+
+    double x1 = xs[0];
+    double x2 = xs[0];
+    double y1 = ys[0];
+    double y2 = ys[0];
+    for (int i = 1; i < 4; i++) {
+        x1 = std::min(x1, xs[i]);
+        x2 = std::max(x2, xs[i]);
+        y1 = std::min(y1, ys[i]);
+        y2 = std::max(y2, ys[i]);
+    }
+
+    rect->x = x1;
+    rect->y = pageHeight - y2;
+    rect->width = x2 - x1;
+    rect->height = y2 - y1;
+}
+
+// Append "page=N pos=[x y] " attributes for a destination that addresses
+// its page by object reference. Returns false, leaving 's' untouched,
+// when the target page has not been written to the cairo output.
+bool CairoOutputDev::appendLinkDestRef(GooString *s, const LinkDest *dest)
+{
+    const Ref ref = dest->getPageRef();
+    const auto emittedPage = pdfPageRefToCairoPageNumMap.find(ref);
+    if (emittedPage == pdfPageRefToCairoPageNumMap.end()) {
+        return false;
+    }
+
+    // The map value already is the cairo page number (see startPage()),
+    // so it must not be looked up again in the map keyed by PDF page
+    // number.
+    s->appendf("page={0:d} ", emittedPage->second);
+
+    // The destination stores a page reference, not a page number, so the
+    // PDF page number is resolved from the reference.
+    const double destPageHeight = doc->getPageMediaHeight(doc->findPage(ref));
+    appendLinkDestXY(s, dest, destPageHeight);
+    return true;
+}
+
+// Append a "pos=[x y] " attribute for 'dest' to 's'. A coordinate the
+// destination does not change is written as 0.
+void CairoOutputDev::appendLinkDestXY(GooString *s, const LinkDest *dest, double destPageHeight)
+{
+    const double left = dest->getChangeLeft() ? dest->getLeft() : 0;
+    const double top = dest->getChangeTop() ? dest->getTop() : 0;
+
+    // if pageHeight is 0, dest is remote document, cairo uses PDF coords in this
+    // case. So don't flip coords when pageHeight is 0.
+    double posY = top;
+    if (destPageHeight) {
+        posY = destPageHeight - top;
+    }
+    s->appendf("pos=[{0:g} {1:g}] ", left, posY);
+}
+
+// Build the attribute string for a link annotation (page, rectangles and
+// target) and open a cairo Link tag with it.
+// Returns false without opening a tag when the annotation has no action,
+// when a named destination has not been emitted, or when the target page
+// is not part of the cairo output. Returns true otherwise.
+bool CairoOutputDev::beginLinkTag(AnnotLink *annotLink)
+{
+    LinkAction *action = annotLink->getAction();
+    if (!action) {
+        // No action to describe; do not dereference a null pointer.
+        return false;
+    }
+
+    int page_num = annotLink->getPageNum();
+    double height = doc->getPageMediaHeight(page_num);
+
+    GooString attrib;
+    attrib.appendf("link_page={0:d} ", page_num);
+    attrib.append("rect=[");
+    AnnotQuadrilaterals *quads = annotLink->getQuadrilaterals();
+    if (quads && quads->getQuadrilateralsLength() > 0) {
+        // One rectangle per quadrilateral.
+        for (int i = 0; i < quads->getQuadrilateralsLength(); i++) {
+            cairo_rectangle_t rect;
+            quadToCairoRect(quads, i, height, &rect);
+            attrib.appendf("{0:g} {1:g} {2:g} {3:g} ", rect.x, rect.y, rect.width, rect.height);
+        }
+    } else {
+        // No quadrilaterals: fall back to the annotation rectangle.
+        double x1, x2, y1, y2;
+        annotLink->getRect(&x1, &y1, &x2, &y2);
+        attrib.appendf("{0:g} {1:g} {2:g} {3:g} ", x1, height - y2, x2 - x1, y2 - y1);
+    }
+    attrib.append("] ");
+
+    if (action->getKind() == actionGoTo) {
+        LinkGoTo *act = static_cast<LinkGoTo *>(action);
+        if (act->isOk()) {
+            const GooString *namedDest = act->getNamedDest();
+            const LinkDest *linkDest = act->getDest();
+            if (namedDest) {
+                GooString name;
+                textStringToQuotedUtf8(namedDest, &name);
+                if (emittedDestinations.count(name.toStr()) == 0) {
+                    return false;
+                }
+                attrib.appendf("dest={0:t} ", &name);
+            } else if (linkDest && linkDest->isOk() && linkDest->isPageRef()) {
+                bool ok = appendLinkDestRef(&attrib, linkDest);
+                if (!ok) {
+                    return false;
+                }
+            }
+        }
+    } else if (action->getKind() == actionGoToR) {
+        LinkGoToR *act = static_cast<LinkGoToR *>(action);
+        attrib.appendf("file='{0:t}' ", act->getFileName());
+        const GooString *namedDest = act->getNamedDest();
+        const LinkDest *linkDest = act->getDest();
+        if (namedDest) {
+            GooString name;
+            textStringToQuotedUtf8(namedDest, &name);
+            if (emittedDestinations.count(name.toStr()) == 0) {
+                return false;
+            }
+            attrib.appendf("dest={0:t} ", &name);
+        } else if (linkDest && linkDest->isOk() && !linkDest->isPageRef()) {
+            auto cairoPage = pdfPageToCairoPageMap.find(linkDest->getPageNum());
+            if (cairoPage != pdfPageToCairoPageMap.end()) {
+                attrib.appendf("page={0:d} ", cairoPage->second);
+                // Height 0 keeps the position in PDF coordinates.
+                appendLinkDestXY(&attrib, linkDest, 0.0);
+            } else {
+                return false;
+            }
+        }
+    } else if (action->getKind() == actionURI) {
+        LinkURI *act = static_cast<LinkURI *>(action);
+        if (act->isOk()) {
+            attrib.appendf("uri='{0:s}'", act->getURI().c_str());
+        }
+    }
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+    cairo_tag_begin(cairo, CAIRO_TAG_LINK, attrib.c_str());
+#endif
+    return true;
+}
+
+// Search 'elem' and then its children, depth first, for an object
+// reference that matches one of the link annotations held in
+// 'annotations'. Returns the first match, or nullptr if there is none.
+AnnotLink *CairoOutputDev::findLinkObject(const StructElement *elem)
+{
+    if (elem->isObjectRef()) {
+        Ref target = elem->getObjectRef();
+        for (std::size_t i = 0; i < annotations.size(); i++) {
+            Annot *candidate = annotations[i];
+            if (candidate->getType() != Annot::typeLink) {
+                continue;
+            }
+            if (candidate->match(&target)) {
+                return static_cast<AnnotLink *>(candidate);
+            }
+        }
+    }
+
+    for (unsigned childIdx = 0; childIdx < elem->getNumChildren(); childIdx++) {
+        if (AnnotLink *found = findLinkObject(elem->getChild(childIdx))) {
+            return found;
+        }
+    }
+
+    return nullptr;
+}
+
+// Open the tag for a Link structure element. When a matching link
+// annotation is found the result of beginLinkTag() is returned; otherwise
+// a tag named after the element type is opened without attributes and
+// true is returned.
+bool CairoOutputDev::beginLink(const StructElement *linkElem)
+{
+    AnnotLink *linkAnnot = findLinkObject(linkElem);
+    if (linkAnnot) {
+        return beginLinkTag(linkAnnot);
+    }
+
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+    cairo_tag_begin(cairo, linkElem->getTypeName(), nullptr);
+#endif
+    return true;
+}
+
+// Format "id" and "parent" attributes from the element's object reference
+// into a local string.
+// NOTE(review): the string is neither returned nor stored, so this
+// function has no effect visible to its caller - confirm whether it is
+// still needed.
+void CairoOutputDev::getStructElemAttributeString(const StructElement *elem)
+{
+    const int mcid = 0;
+    const Ref objRef = elem->getObjectRef();
+
+    GooString attribs;
+    attribs.appendf("id='{0:d}_{1:d}_{2:d}'", objRef.num, objRef.gen, mcid);
+    attribs.appendf(" parent='{0:d}_{1:d}'", objRef.num, objRef.gen);
+}
+
+// Look up the StructParents key that applies to a content element: taken
+// from the stream the element references if it has one, otherwise from
+// the page it references. Returns -1 (after reporting a syntax error)
+// when no integer StructParents entry can be found.
+int CairoOutputDev::getContentElementStructParents(const StructElement *element)
+{
+    int structParents = -1;
+    Ref ref;
+
+    if (element->hasStmRef()) {
+        element->getStmRef(ref);
+        Object xobjectObj = xref->fetch(ref);
+        // Only a stream has a stream dictionary; a damaged file may make
+        // the reference resolve to something else.
+        if (xobjectObj.isStream()) {
+            const Object spObj = xobjectObj.streamGetDict()->lookup("StructParents");
+            if (spObj.isInt()) {
+                structParents = spObj.getInt();
+            }
+        }
+    } else if (element->hasPageRef()) {
+        element->getPageRef(ref);
+        Object pageObj = xref->fetch(ref);
+        // Same guard for the page object, which must be a dictionary.
+        if (pageObj.isDict()) {
+            const Object spObj = pageObj.dictLookup("StructParents");
+            if (spObj.isInt()) {
+                structParents = spObj.getInt();
+            }
+        }
+    }
+
+    if (structParents == -1) {
+        error(errSyntaxError, -1, "Unable to find StructParents object for StructElement");
+    }
+    return structParents;
+}
+
+// Decide whether 'element' must be written to the output and record it in
+// structElementNeeded if so. A marked-content element is needed when its
+// <StructParents, MCID> pair was emitted; an object-reference element is
+// never needed by itself; any other element is needed when at least one
+// child is. Every child is visited, even after one was found needed.
+bool CairoOutputDev::checkIfStructElementNeeded(const StructElement *element)
+{
+    if (element->isContent()) {
+        if (element->isObjectRef()) {
+            return false;
+        }
+        int structParents = getContentElementStructParents(element);
+        int mcid = element->getMCID();
+        if (mcidEmitted.count(std::pair(structParents, mcid)) == 0) {
+            return false;
+        }
+        structElementNeeded.insert(element);
+        return true;
+    }
+
+    bool anyChildNeeded = false;
+    for (unsigned childIdx = 0; childIdx < element->getNumChildren(); childIdx++) {
+        const bool childNeeded = checkIfStructElementNeeded(element->getChild(childIdx));
+        anyChildNeeded = anyChildNeeded || childNeeded;
+    }
+    if (anyChildNeeded) {
+        structElementNeeded.insert(element);
+    }
+    return anyChildNeeded;
+}
+
+// Write 'element' and its descendants to the cairo tag structure.
+// Elements not recorded in structElementNeeded are skipped. Marked
+// content becomes a content reference; other elements become a tag named
+// after the element type that wraps their children. A Link element whose
+// tag cannot be opened is dropped together with its children.
+void CairoOutputDev::emitStructElement(const StructElement *element)
+{
+    if (structElementNeeded.find(element) == structElementNeeded.end()) {
+        return;
+    }
+
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+    if (element->isContent()) {
+        if (element->isObjectRef()) {
+            return;
+        }
+        int structParents = getContentElementStructParents(element);
+        int mcid = element->getMCID();
+        GooString refAttrib;
+        refAttrib.appendf("ref='{0:d}_{1:d}'", structParents, mcid);
+        cairo_tag_begin(cairo, CAIRO_TAG_CONTENT_REF, refAttrib.c_str());
+        cairo_tag_end(cairo, CAIRO_TAG_CONTENT_REF);
+        return;
+    }
+
+    if (element->getType() == StructElement::Link) {
+        if (!beginLink(element)) {
+            return;
+        }
+    } else {
+        cairo_tag_begin(cairo, element->getTypeName(), "");
+    }
+
+    for (unsigned childIdx = 0; childIdx < element->getNumChildren(); childIdx++) {
+        emitStructElement(element->getChild(childIdx));
+    }
+    cairo_tag_end(cairo, element->getTypeName());
+#endif
+}
+
+// Write the document's structure tree to the cairo output. Does nothing
+// unless logical structure output to a PDF surface is enabled and the
+// document has a structure tree root.
+void CairoOutputDev::emitStructTree()
+{
+    if (!logicalStruct || !isPDF()) {
+        return;
+    }
+
+    const StructTreeRoot *root = doc->getStructTreeRoot();
+    if (!root) {
+        return;
+    }
+
+    // First pass: mark every element that has to be written.
+    for (unsigned childIdx = 0; childIdx < root->getNumChildren(); childIdx++) {
+        checkIfStructElementNeeded(root->getChild(childIdx));
+    }
+
+    // Second pass: write the marked elements.
+    for (unsigned childIdx = 0; childIdx < root->getNumChildren(); childIdx++) {
+        emitStructElement(root->getChild(childIdx));
+    }
+}
+
void CairoOutputDev::startType3Render(GfxState *state, XRef *xrefA)
{
/* When cairo calls a user font render function, the default
@@ -3468,6 +3884,58 @@ void CairoOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, int wi
cairo_pattern_destroy(pattern);
}
+// Called when a marked-content sequence starts. Artifacts open a tag of
+// that name; sequences carrying an MCID open a cairo content tag whose id
+// is "<StructParents>_<MCID>" and are recorded in mcidEmitted. Every
+// sequence pushes exactly one entry on markedContentStack so that the
+// matching endMarkedContent() pops its own entry. Does nothing unless
+// logical structure output to a PDF surface is enabled.
+void CairoOutputDev::beginMarkedContent(const char *name, Dict *properties)
+{
+    if (!logicalStruct || !isPDF()) {
+        return;
+    }
+
+    if (strcmp(name, "Artifact") == 0) {
+        markedContentStack.emplace_back(name);
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+        cairo_tag_begin(cairo, name, nullptr);
+#endif
+        return;
+    }
+
+    int mcid = -1;
+    if (properties) {
+        properties->lookupInt("MCID", nullptr, &mcid);
+    }
+
+    if (mcid == -1) {
+        // No cairo tag is opened for this sequence, but its end still
+        // pops one stack entry. Push an empty placeholder so that the
+        // enclosing sequence's tag is not closed early.
+        markedContentStack.emplace_back();
+        return;
+    }
+
+    GooString attribs;
+    attribs.appendf("tag_name='{0:s}' id='{1:d}_{2:d}'", name, currentStructParents, mcid);
+    mcidEmitted.insert(std::pair<int, int>(currentStructParents, mcid));
+
+    std::string tag;
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+    tag = CAIRO_TAG_CONTENT;
+    cairo_tag_begin(cairo, CAIRO_TAG_CONTENT, attribs.c_str());
+#endif
+
+    markedContentStack.push_back(tag);
+}
+
+// Called when a marked-content sequence ends. Pops the entry pushed by the
+// matching beginMarkedContent() and closes its cairo tag; an empty entry
+// means no tag was opened for that sequence.
+void CairoOutputDev::endMarkedContent(GfxState *state)
+{
+    if (!logicalStruct || !isPDF()) {
+        return;
+    }
+
+    if (markedContentStack.empty()) {
+        return;
+    }
+
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+    if (!markedContentStack.back().empty()) {
+        cairo_tag_end(cairo, markedContentStack.back().c_str());
+    }
+#endif
+    markedContentStack.pop_back();
+}
+
//------------------------------------------------------------------------
// ImageOutputDev
//------------------------------------------------------------------------
diff --git a/poppler/CairoOutputDev.h b/poppler/CairoOutputDev.h
index c4c5f8d0..a955115e 100644
--- a/poppler/CairoOutputDev.h
+++ b/poppler/CairoOutputDev.h
@@ -37,10 +37,16 @@
#ifndef CAIROOUTPUTDEV_H
#define CAIROOUTPUTDEV_H
+#include <unordered_set>
+
#include <cairo-ft.h>
#include "OutputDev.h"
#include "TextOutputDev.h"
#include "GfxState.h"
+#include "StructElement.h"
+#include "StructTreeRoot.h"
+#include "Annot.h"
+#include "Link.h"
class PDFDoc;
class GfxState;
@@ -143,6 +149,12 @@ public:
// End a page.
void endPage() override;
+ // Must be called before last call to endPage()
+ void emitStructTree();
+
+ void beginForm(Object *obj, Ref id) override;
+ void endForm(Object *obj, Ref id) override;
+
//----- save/restore graphics state
void saveState(GfxState *state) override;
void restoreState(GfxState *state) override;
@@ -201,6 +213,9 @@ public:
void beginTextObject(GfxState *state) override;
void endTextObject(GfxState *state) override;
+ void beginMarkedContent(const char *name, Dict *properties) override;
+ void endMarkedContent(GfxState *state) override;
+
//----- image drawing
void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool interpolate, bool inlineImg) override;
void setSoftMaskFromImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool inlineImg, double *baseMatrix) override;
@@ -244,6 +259,7 @@ public:
needFontUpdate = true;
}
void copyAntialias(cairo_t *cr, cairo_t *source_cr);
+ // Set the logicalStruct flag. The structure-related code in this device
+ // runs only when this flag is true and the cairo target is a PDF surface.
+ void setLogicalStructure(bool logStruct) { this->logicalStruct = logStruct; }
enum Type3RenderType
{
@@ -270,12 +286,25 @@ protected:
void setMimeData(GfxState *state, Stream *str, Object *ref, GfxImageColorMap *colorMap, cairo_surface_t *image, int height);
void fillToStrokePathClip(GfxState *state);
void alignStrokeCoords(const GfxSubpath *subpath, int i, double *x, double *y);
+ AnnotLink *findLinkObject(const StructElement *elem);
+ void quadToCairoRect(AnnotQuadrilaterals *quads, int idx, double destPageHeight, cairo_rectangle_t *rect);
+ bool appendLinkDestRef(GooString *s, const LinkDest *dest);
+ void appendLinkDestXY(GooString *s, const LinkDest *dest, double destPageHeight);
+ bool beginLinkTag(AnnotLink *annotLink);
+ bool beginLink(const StructElement *linkElem);
+ void getStructElemAttributeString(const StructElement *elem);
+ int getContentElementStructParents(const StructElement *element);
+ bool checkIfStructElementNeeded(const StructElement *element);
+ void emitStructElement(const StructElement *elem);
+ void startFirstPage(int pageNum, GfxState *state, XRef *xrefA);
#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 14, 0)
bool setMimeDataForJBIG2Globals(Stream *str, cairo_surface_t *image);
#endif
#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 10)
bool setMimeDataForCCITTParams(Stream *str, cairo_surface_t *image, int height);
#endif
+ static void textStringToQuotedUtf8(const GooString *text, GooString *s);
+ bool isPDF();
std::optional<GfxRGB> fill_color, stroke_color;
cairo_pattern_t *fill_pattern, *stroke_pattern;
@@ -331,6 +360,14 @@ protected:
bool has_color;
double t3_glyph_bbox[4];
bool prescaleImages;
+ bool logicalStruct;
+ bool firstPage;
+ int pdfPageNum; // page number of the PDF file
+ int cairoPageNum; // page number in cairo output
+ std::vector<std::string> markedContentStack;
+ std::vector<Annot *> annotations;
+ std::set<std::string> emittedDestinations;
+ std::map<int, int> pdfPageToCairoPageMap;
TextPage *textPage; // text for the current page
ActualText *actualText;
@@ -361,6 +398,19 @@ protected:
Ref fontRef;
};
std::vector<SaveStateElement> saveStateStack;
+
+ std::map<Ref, std::map<std::string, std::unique_ptr<LinkDest>>> destsMap;
+ std::map<Ref, int> pdfPageRefToCairoPageNumMap;
+ std::vector<int> structParentsStack;
+ int currentStructParents;
+
+    // Hash for the <structParents, MCID> keys stored in mcidEmitted.
+    struct StructParentsMcidHash
+    {
+        size_t operator()(std::pair<int, int> x) const
+        {
+            // structParents can be -1, and left-shifting a negative int is
+            // undefined before C++20, so combine the two values through
+            // unsigned arithmetic. Using 32 bits per half also keeps
+            // distinct pairs from colliding before hashing.
+            const unsigned long long high = static_cast<unsigned int>(x.first);
+            const unsigned long long low = static_cast<unsigned int>(x.second);
+            return std::hash<unsigned long long>()((high << 32) | low);
+        }
+    };
+ std::unordered_set<std::pair<int, int>, StructParentsMcidHash> mcidEmitted; // <structParent, MCID>
+
+ std::unordered_set<const StructElement *> structElementNeeded;
};
//------------------------------------------------------------------------
diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index 00662256..3fa56aa4 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -4135,9 +4135,9 @@ void Gfx::opXObject(Object args[], int numArgs)
out->drawForm(refObj.getRef());
} else {
Ref ref = refObj.isRef() ? refObj.getRef() : Ref::INVALID();
- out->beginForm(ref);
+ out->beginForm(&obj1, ref);
doForm(&obj1);
- out->endForm(ref);
+ out->endForm(&obj1, ref);
}
}
if (refObj.isRef() && shouldDoForm) {
diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc
index e3a4cdce..63b17446 100644
--- a/poppler/MarkedContentOutputDev.cc
+++ b/poppler/MarkedContentOutputDev.cc
@@ -56,12 +56,12 @@ void MarkedContentOutputDev::endPage()
pageWidth = pageHeight = 0.0;
}
-void MarkedContentOutputDev::beginForm(Ref id)
+void MarkedContentOutputDev::beginForm(Object * /* obj */, Ref id)
{
formStack.push_back(id);
}
-void MarkedContentOutputDev::endForm(Ref id)
+void MarkedContentOutputDev::endForm(Object * /* obj */, Ref id)
{
formStack.pop_back();
}
diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h
index dad39a77..d6584b70 100644
--- a/poppler/MarkedContentOutputDev.h
+++ b/poppler/MarkedContentOutputDev.h
@@ -100,8 +100,8 @@ public:
void startPage(int pageNum, GfxState *state, XRef *xref) override;
void endPage() override;
- void beginForm(Ref id) override;
- void endForm(Ref id) override;
+ void beginForm(Object * /* obj */, Ref id) override;
+ void endForm(Object * /* obj */, Ref id) override;
void drawChar(GfxState *state, double xx, double yy, double dx, double dy, double ox, double oy, CharCode c, int nBytes, const Unicode *u, int uLen) override;
diff --git a/poppler/OutputDev.h b/poppler/OutputDev.h
index 67c21618..02438ab0 100644
--- a/poppler/OutputDev.h
+++ b/poppler/OutputDev.h
@@ -320,9 +320,9 @@ public:
virtual void type3D1(GfxState * /*state*/, double /*wx*/, double /*wy*/, double /*llx*/, double /*lly*/, double /*urx*/, double /*ury*/) { }
//----- form XObjects
- virtual void beginForm(Ref /*id*/) { }
+ virtual void beginForm(Object * /* obj */, Ref /*id*/) { }
virtual void drawForm(Ref /*id*/) { }
- virtual void endForm(Ref /*id*/) { }
+ virtual void endForm(Object * /* obj */, Ref /*id*/) { }
//----- PostScript XObjects
virtual void psXObject(Stream * /*psStream*/, Stream * /*level1Stream*/) { }
diff --git a/poppler/Page.cc b/poppler/Page.cc
index 9d5a4ffb..99639f3d 100644
--- a/poppler/Page.cc
+++ b/poppler/Page.cc
@@ -259,6 +259,7 @@ Page::Page(PDFDoc *docA, int numA, Object &&pageDict, Ref pageRefA, PageAttrs *a
num = numA;
duration = -1;
annots = nullptr;
+ structParents = -1;
pageObj = std::move(pageDict);
@@ -281,6 +282,14 @@ Page::Page(PDFDoc *docA, int numA, Object &&pageDict, Ref pageRefA, PageAttrs *a
duration = tmp.getNum();
}
+ // structParents
+ const Object &tmp2 = pageObj.dictLookup("StructParents");
+ if (!(tmp2.isInt() || tmp2.isNull())) {
+ error(errSyntaxError, -1, "Page StructParents object (page {0:d}) is wrong type ({1:s})", num, tmp2.getTypeName());
+ } else if (tmp2.isInt()) {
+ structParents = tmp2.getInt();
+ }
+
// annotations
annotsObj = pageObj.dictLookupNF("Annots").copy();
if (!(annotsObj.isRef() || annotsObj.isArray() || annotsObj.isNull())) {
diff --git a/poppler/Page.h b/poppler/Page.h
index 3fe86be7..fdd4c260 100644
--- a/poppler/Page.h
+++ b/poppler/Page.h
@@ -242,6 +242,10 @@ public:
bool hasStandaloneFields() const { return !standaloneFields.empty(); }
+ // Get the integer key of the page's entry in the structural parent tree.
+ // Returns -1 if the page dict does not contain a StructParents key.
+ int getStructParents() const { return structParents; }
+
private:
// replace xref
void replaceXRef(XRef *xrefA);
@@ -259,6 +263,7 @@ private:
Object trans; // page transition
Object actions; // page additional actions
double duration; // page duration
+ int structParents; // integer key of page in structure parent tree
bool ok; // true if page is valid
mutable std::recursive_mutex mutex;
// standalone widgets are special FormWidget's inside a Page that *are not*
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 9b722c19..35babbba 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -779,6 +779,15 @@ bool StructElement::getPageRef(Ref &ref) const
return false;
}
+// Copy the element's stream reference into 'ref'. Returns false, leaving
+// 'ref' untouched, when the element holds no stream reference.
+bool StructElement::getStmRef(Ref &ref) const
+{
+    if (!stmRef.isRef()) {
+        return false;
+    }
+    ref = stmRef.getRef();
+    return true;
+}
+
const char *StructElement::getTypeName() const
{
return typeToName(type);
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 2244fbe2..d083735d 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -242,9 +242,12 @@ public:
int getMCID() const { return c->mcid; }
Ref getObjectRef() const { return c->ref; }
- Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef; }
+ Ref getParentRef() const { return isContent() ? parent->getParentRef() : s->parentRef; }
+ StructElement *getParent() const { return parent; } // returns NULL if parent is StructTreeRoot
bool hasPageRef() const;
bool getPageRef(Ref &ref) const;
+ bool hasStmRef() const { return stmRef.isRef(); }
+ bool getStmRef(Ref &ref) const;
StructTreeRoot *getStructTreeRoot() { return treeRoot; }
// Optional element identifier.
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index cce313ef..9216e61f 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -561,3 +561,35 @@ void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_le
*indices = idx;
}
}
+
+// Convert a PDF Text String to UTF-8
+// textStr - PDF text string: UTF-16 introduced by a Unicode marker, or
+//           PDFDocEncoding otherwise
+// returns UTF-8 string. The result ends at the first NUL character.
+std::string TextStringToUtf8(const std::string &textStr)
+{
+    if (GooString::hasUnicodeMarker(textStr)) {
+        const char *s = textStr.c_str();
+        // Number of 16-bit units after the two-byte marker.
+        int len = static_cast<int>(textStr.size()) / 2 - 1;
+        uint16_t *utf16 = new uint16_t[len];
+        for (int i = 0; i < len; i++) {
+            utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+        }
+        char *utf8 = utf16ToUtf8(utf16, &len);
+        delete[] utf16;
+        std::string utf8_string(utf8);
+        gfree(utf8);
+        return utf8_string;
+    }
+
+    // PDFDocEncoding: each byte maps to a Unicode code point, which may
+    // need more than one byte in UTF-8. Storing the code point in a
+    // single char would truncate it.
+    std::string utf8_string;
+    utf8_string.reserve(textStr.size());
+    for (const char c : textStr) {
+        const unsigned int u = pdfDocEncoding[c & 0xff];
+        if (u == 0) {
+            // Keep the result ending at the first NUL.
+            break;
+        }
+        if (u < 0x80) {
+            utf8_string.push_back(static_cast<char>(u));
+        } else if (u < 0x800) {
+            utf8_string.push_back(static_cast<char>(0xc0 | (u >> 6)));
+            utf8_string.push_back(static_cast<char>(0x80 | (u & 0x3f)));
+        } else if (u < 0x10000) {
+            utf8_string.push_back(static_cast<char>(0xe0 | (u >> 12)));
+            utf8_string.push_back(static_cast<char>(0x80 | ((u >> 6) & 0x3f)));
+            utf8_string.push_back(static_cast<char>(0x80 | (u & 0x3f)));
+        } else {
+            utf8_string.push_back(static_cast<char>(0xf0 | (u >> 18)));
+            utf8_string.push_back(static_cast<char>(0x80 | ((u >> 12) & 0x3f)));
+            utf8_string.push_back(static_cast<char>(0x80 | ((u >> 6) & 0x3f)));
+            utf8_string.push_back(static_cast<char>(0x80 | (u & 0x3f)));
+        }
+    }
+    return utf8_string;
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index bd04d05e..598c2e45 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -125,4 +125,9 @@ char POPPLER_PRIVATE_EXPORT *utf16ToUtf8(const uint16_t *utf16, int *len = nullp
// being passed in @in_idx parameter).
void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices);
+// Convert a PDF Text String to UTF-8
+// textStr - PDF text string
+// returns UTF-8 string.
+std::string POPPLER_PRIVATE_EXPORT TextStringToUtf8(const std::string &textStr);
+
#endif
diff --git a/utils/pdftocairo.1 b/utils/pdftocairo.1
index 8de23925..782f81fd 100644
--- a/utils/pdftocairo.1
+++ b/utils/pdftocairo.1
@@ -211,6 +211,10 @@ Generate Level 2 PostScript (PS only).
Generate Level 3 PostScript (PS only). This enables all Level 2 features plus
shading patterns and masked images. This is the default setting.
.TP
+.B \-struct
+If the input file contains structural information about the document's content,
+write this information to the output file (PDF only).
+.TP
.B \-origpagesizes
This option is the same as "\-paper match".
.TP
diff --git a/utils/pdftocairo.cc b/utils/pdftocairo.cc
index faa5a02d..f16f03e4 100644
--- a/utils/pdftocairo.cc
+++ b/utils/pdftocairo.cc
@@ -132,6 +132,7 @@ static bool noShrink = false;
static bool noCenter = false;
static bool duplex = false;
static char tiffCompressionStr[16] = "";
+static bool docStruct = false;
static char ownerPassword[33] = "";
static char userPassword[33] = "";
@@ -220,6 +221,10 @@ static const ArgDesc argDesc[] = {
{ "-nocenter", argFlag, &noCenter, 0, "don't center pages smaller than the paper size" },
{ "-duplex", argFlag, &duplex, 0, "enable duplex printing" },
+#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 18, 0)
+ { "-struct", argFlag, &docStruct, 0, "enable logical document structure" },
+#endif
+
{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
{ "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
@@ -725,11 +730,25 @@ static void renderPage(PDFDoc *doc, CairoOutputDev *cairoOut, int pg, double pag
cairo_destroy(cr);
}
-static void endPage(GooString *imageFileName)
+static void endPage(GooString *imageFileName, CairoOutputDev *cairoOut, bool isLastPage)
{
cairo_status_t status;
+ cairo_t *cr;
if (printing) {
+ if (isLastPage) {
+ cr = cairo_create(surface);
+ cairoOut->setCairo(cr);
+ cairoOut->setPrinting(printing);
+ cairoOut->emitStructTree();
+ cairoOut->setCairo(nullptr);
+ status = cairo_status(cr);
+ if (status) {
+ fprintf(stderr, "cairo error: %s\n", cairo_status_to_string(status));
+ }
+ cairo_destroy(cr);
+ }
+
cairo_surface_show_page(surface);
#ifdef CAIRO_HAS_WIN32_SURFACE
@@ -907,7 +926,6 @@ int main(int argc, char *argv[])
int pg, pg_num_len;
double pg_w, pg_h, tmp, output_w, output_h;
int num_outputs;
- bool documentInitialized = false;
// parse args
Win32Console win32Console(&argc, &argv);
@@ -1026,6 +1044,10 @@ int main(int argc, char *argv[])
level3 = true;
}
+    // -struct needs a PDF output surface.
+    if (docStruct && !pdf) {
+        fprintf(stderr, "Error: -struct may only be used with pdf output.\n");
+        exit(99);
+    }
if (eps && (origPageSizes || paperSize[0] || paperWidth > 0 || paperHeight > 0)) {
fprintf(stderr, "Error: page size options may not be used with eps output.\n");
exit(99);
@@ -1145,8 +1167,15 @@ int main(int argc, char *argv[])
// If our page range selection and document size indicate we're only
// outputting a single page, ensure that even/odd page selection doesn't
- // filter out that single page.
- if (firstPage == lastPage && ((printOnlyEven && firstPage % 2 == 1) || (printOnlyOdd && firstPage % 2 == 0))) {
+ // filter out that single page. Also adjust first and last page so there are no pages
+ // skipped at the start or end of the for loop.
+ if ((printOnlyEven && firstPage % 2 == 1) || (printOnlyOdd && firstPage % 2 == 0)) {
+ firstPage++;
+ }
+ if ((printOnlyEven && lastPage % 2 == 1) || (printOnlyOdd && lastPage % 2 == 0)) {
+ lastPage--;
+ }
+ if (lastPage < firstPage) {
fprintf(stderr, "Invalid even/odd page selection, no pages match criteria.\n");
exit(99);
}
@@ -1174,6 +1203,8 @@ int main(int argc, char *argv[])
#endif
cairoOut = new CairoOutputDev();
+ cairoOut->setLogicalStructure(docStruct);
+
#ifdef USE_CMS
cairoOut->setDisplayProfile(profile);
#endif
@@ -1197,7 +1228,7 @@ int main(int argc, char *argv[])
pg_h = doc->getPageMediaHeight(pg);
}
- if (printing && !documentInitialized) {
+ if (printing && pg == firstPage) {
if (paperWidth < 0 || paperHeight < 0) {
paperWidth = (int)ceil(pg_w);
paperHeight = (int)ceil(pg_h);
@@ -1235,13 +1266,12 @@ int main(int argc, char *argv[])
}
getOutputSize(pg_w, pg_h, &output_w, &output_h);
- if (!documentInitialized) {
+ if (pg == firstPage) {
beginDocument(fileName, outputFileName, output_w, output_h);
- documentInitialized = true;
}
beginPage(&output_w, &output_h);
renderPage(doc.get(), cairoOut, pg, pg_w, pg_h, output_w, output_h);
- endPage(imageFileName);
+ endPage(imageFileName, cairoOut, pg == lastPage);
}
endDocument();