From 9d03df990070b6e96cfb22d4413fd3e5d6a15763 Mon Sep 17 00:00:00 2001 From: Ashod Nakashian Date: Sun, 22 Apr 2018 10:48:51 -0400 Subject: svx: import processed PDF text Some PDFs don't include spaces in the text. Instead, they rely on the explicit positioning of each character to render visually separated words. Latex seems to be prone to this approach, though not exclusively. Luckily, PDFium does process text and inserts "generated" spaces where necessary, which is what we retrieve and use as the text string while importing. Change-Id: Ic21fe6c8416ecaba66f06b6260f1d6b040ff12af (cherry picked from commit da4b44d6afc01de4fb08251732ddcbdbd832b71f) --- external/pdfium/edit.patch.1 | 151 +++++++++++++++++++++++++++++++++++++++---- svx/source/svdraw/svdpdf.cxx | 30 +++++---- svx/source/svdraw/svdpdf.hxx | 12 ++-- 3 files changed, 165 insertions(+), 28 deletions(-) diff --git a/external/pdfium/edit.patch.1 b/external/pdfium/edit.patch.1 index 270dceb871b6..a110313017da 100644 --- a/external/pdfium/edit.patch.1 +++ b/external/pdfium/edit.patch.1 @@ -147,6 +147,56 @@ index 0a01ae0..6947e3a 100644 if (bPattern) { DrawTextPathWithPattern(textobj, pObj2Device, pFont, font_size, &text_matrix, bFill, bStroke); +diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp +index e712549..a7973f7 100644 +--- a/core/fpdftext/cpdf_textpage.cpp ++++ b/core/fpdftext/cpdf_textpage.cpp +@@ -1490,3 +1490,32 @@ bool CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, + rect.Intersect(rect2); + return !rect.IsEmpty(); + } ++ ++WideString CPDF_TextPage::GetTextObjectText(CPDF_TextObject* pTextObj) ++{ ++ if (!m_bIsParsed) ++ return WideString(); ++ ++ float posy = 0; ++ bool IsContainPreChar = false; ++ bool IsAddLineFeed = false; ++ WideString strText; ++ for (const auto& charinfo : m_CharList) { ++ if (charinfo.m_pTextObj == pTextObj) { ++ IsContainPreChar = true; ++ IsAddLineFeed = false; ++ if (charinfo.m_Unicode) ++ strText += charinfo.m_Unicode; ++ } else 
if (charinfo.m_Unicode == 32) { ++ if (IsContainPreChar && charinfo.m_Unicode) { ++ strText += charinfo.m_Unicode; ++ IsContainPreChar = false; ++ IsAddLineFeed = false; ++ } ++ } else { ++ IsContainPreChar = false; ++ IsAddLineFeed = true; ++ } ++ } ++ return strText; ++} +diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h +index c87ab00..e5a1ba8 100644 +--- a/core/fpdftext/cpdf_textpage.h ++++ b/core/fpdftext/cpdf_textpage.h +@@ -110,6 +110,8 @@ class CPDF_TextPage { + WideString GetPageText(int start, int count) const; + WideString GetAllPageText() const { return GetPageText(0, CountChars()); } + ++ WideString GetTextObjectText(CPDF_TextObject* pTextObj); ++ + int CountRects(int start, int nCount); + bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; + diff --git a/core/fxge/cfx_pathdata.cpp b/core/fxge/cfx_pathdata.cpp index 4ac5cf6..4286de4 100644 --- a/core/fxge/cfx_pathdata.cpp @@ -199,7 +249,7 @@ index 0d7ba56..37bdf99 100644 FPDFImageObj_GetImageDataDecoded(FPDF_PAGEOBJECT image_object, void* buffer, diff --git a/fpdfsdk/fpdfeditpage.cpp b/fpdfsdk/fpdfeditpage.cpp -index ca2cf3f..832a9ae 100644 +index ca2cf3f..2162625 100644 --- a/fpdfsdk/fpdfeditpage.cpp +++ b/fpdfsdk/fpdfeditpage.cpp @@ -11,12 +11,14 @@ @@ -217,7 +267,15 @@ index ca2cf3f..832a9ae 100644 #include "core/fpdfapi/page/cpdf_shadingobject.h" #include "core/fpdfapi/parser/cpdf_array.h" #include "core/fpdfapi/parser/cpdf_document.h" -@@ -363,3 +365,212 @@ FPDFPageObj_GetBounds(FPDF_PAGEOBJECT pageObject, +@@ -24,6 +26,7 @@ + #include "core/fpdfapi/parser/cpdf_string.h" + #include "core/fpdfdoc/cpdf_annot.h" + #include "core/fpdfdoc/cpdf_annotlist.h" ++#include "core/fpdftext/cpdf_textpage.h" + #include "fpdfsdk/fsdk_define.h" + #include "public/fpdf_formfill.h" + #include "third_party/base/logging.h" +@@ -363,3 +366,252 @@ FPDFPageObj_GetBounds(FPDF_PAGEOBJECT pageObject, *top = bbox.top; return true; } @@ -327,6 +385,46 @@ index ca2cf3f..832a9ae 100644 + return 
ret_count; +} + ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object, ++ FPDF_TEXTPAGE page, ++ int char_start, ++ int char_count, ++ unsigned short* result) ++{ ++ if (!page || !text_object || char_start < 0 || char_count < 0 || !result) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = CPDFTextObjectFromFPDFPageObject(text_object); ++ CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page); ++ int char_available = textpage->CountChars() - char_start; ++ if (char_available <= 0) ++ return 0; ++ ++ char_count = std::min(char_count, char_available); ++ if (char_count == 0) { ++ // Writing out "", which has a character count of 1 due to the NUL. ++ *result = '\0'; ++ return 1; ++ } ++ ++ WideString str = textpage->GetTextObjectText(pTxtObj); ++ ++ if (str.GetLength() > static_cast(char_count)) ++ str = str.Left(static_cast(char_count)); ++ ++ // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected ++ // the number of items to stay the same. ++ ByteString byte_str = str.UTF16LE_Encode(); ++ size_t byte_str_len = byte_str.GetLength(); ++ constexpr size_t kBytesPerCharacter = sizeof(unsigned short); ++ int ret_count = byte_str_len / kBytesPerCharacter; ++ ++ ASSERT(ret_count <= char_count + 1); // +1 to account for the NUL terminator. 
++ memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len); ++ return ret_count; ++} ++ +FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV +FPDFTextObj_GetColor(FPDF_PAGEOBJECT text_object, + unsigned int* R, @@ -509,10 +607,21 @@ index a291987..0202284 100644 auto* pPathPoint = FXPathPointFromFPDFPathSegment(segment); if (!pPathPoint || !x || !y) diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp -index 68bf4f8..e073b20 100644 +index 68bf4f8..1cac9c7 100644 --- a/fpdfsdk/fpdftext.cpp +++ b/fpdfsdk/fpdftext.cpp -@@ -105,6 +105,28 @@ FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, +@@ -31,10 +31,6 @@ namespace { + + constexpr size_t kBytesPerCharacter = sizeof(unsigned short); + +-CPDF_TextPage* CPDFTextPageFromFPDFTextPage(FPDF_TEXTPAGE text_page) { +- return static_cast(text_page); +-} +- + CPDF_TextPageFind* CPDFTextPageFindFromFPDFSchHandle(FPDF_SCHHANDLE handle) { + return static_cast(handle); + } +@@ -105,6 +101,28 @@ FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, return charinfo.m_FontSize; } @@ -542,13 +651,17 @@ index 68bf4f8..e073b20 100644 int index, double* left, diff --git a/fpdfsdk/fpdfview.cpp b/fpdfsdk/fpdfview.cpp -index e890aa0..b62283f 100644 +index e890aa0..09d2345 100644 --- a/fpdfsdk/fpdfview.cpp +++ b/fpdfsdk/fpdfview.cpp -@@ -336,6 +336,16 @@ CPDF_Page* CPDFPageFromFPDFPage(FPDF_PAGE page) { +@@ -336,6 +336,20 @@ CPDF_Page* CPDFPageFromFPDFPage(FPDF_PAGE page) { #endif // PDF_ENABLE_XFA } ++CPDF_TextPage* CPDFTextPageFromFPDFTextPage(FPDF_TEXTPAGE text_page) { ++ return static_cast(text_page); ++} ++ +CPDF_TextObject* CPDFTextObjectFromFPDFPageObject(FPDF_PAGEOBJECT page_object) { + auto* obj = CPDFPageObjectFromFPDFPageObject(page_object); + return obj ? obj->AsText() : nullptr; @@ -563,22 +676,25 @@ index e890aa0..b62283f 100644 auto* obj = CPDFPageObjectFromFPDFPageObject(page_object); return obj ? 
obj->AsPath() : nullptr; diff --git a/fpdfsdk/fsdk_define.h b/fpdfsdk/fsdk_define.h -index 77c2315..b61f447 100644 +index 77c2315..e9a309a 100644 --- a/fpdfsdk/fsdk_define.h +++ b/fpdfsdk/fsdk_define.h -@@ -25,6 +25,8 @@ class CPDF_Annot; +@@ -25,6 +25,9 @@ class CPDF_Annot; class CPDF_Page; class CPDF_PageObject; class CPDF_PageRenderContext; +class CPDF_TextObject; ++class CPDF_TextPage; +class CPDF_FormObject; class CPDF_PathObject; class CPDF_Stream; class IFSDK_PAUSE_Adapter; -@@ -65,6 +67,10 @@ FPDF_DOCUMENT FPDFDocumentFromCPDFDocument(CPDF_Document* doc); +@@ -65,6 +68,12 @@ FPDF_DOCUMENT FPDFDocumentFromCPDFDocument(CPDF_Document* doc); CPDF_Page* CPDFPageFromFPDFPage(FPDF_PAGE page); ++CPDF_TextPage* CPDFTextPageFromFPDFTextPage(FPDF_TEXTPAGE text_page); ++ +CPDF_TextObject* CPDFTextObjectFromFPDFPageObject(FPDF_PAGEOBJECT page_object); + +CPDF_FormObject* CPDFFormObjectFromFPDFPageObject(FPDF_PAGEOBJECT page_object); @@ -587,7 +703,7 @@ index 77c2315..b61f447 100644 CPDF_PageObject* CPDFPageObjectFromFPDFPageObject(FPDF_PAGEOBJECT page_object); diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h -index 54735a3..2e7e2e7 100644 +index 54735a3..4d81aac 100644 --- a/public/fpdf_edit.h +++ b/public/fpdf_edit.h @@ -520,6 +520,15 @@ FPDFPath_GetStrokeColor(FPDF_PAGEOBJECT path, @@ -643,7 +759,7 @@ index 54735a3..2e7e2e7 100644 // Create a new text object using one of the standard PDF fonts. // // document - handle to the document. -@@ -761,6 +800,112 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document, +@@ -761,6 +800,125 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document, FPDF_FONT font, float font_size); @@ -702,6 +818,19 @@ index 54735a3..2e7e2e7 100644 + int char_count, + unsigned short* result); + ++// Get the processed text of a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// Return Value: ++// The number of characters (not bytes) written in result. 
++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object, ++ FPDF_TEXTPAGE page, ++ int char_start, ++ int char_count, ++ unsigned short* result); ++ +// Get the stroke RGBA of a text. Range of values: 0 - 255. +// +// path - the handle to the path object. diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx index ddb82cd9bf6b..a732989f092a 100644 --- a/svx/source/svdraw/svdpdf.cxx +++ b/svx/source/svdraw/svdpdf.cxx @@ -227,13 +227,18 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc << ", height: " << dPageHeight); SetupPageScale(dPageWidth, dPageHeight); + // Load the page text to extract it when we get text elements. + FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage); + const int nPageObjectCount = FPDFPage_CountObject(pPdfPage); for (int nPageObjectIndex = 0; nPageObjectIndex < nPageObjectCount; ++nPageObjectIndex) { FPDF_PAGEOBJECT pPageObject = FPDFPage_GetObject(pPdfPage, nPageObjectIndex); - ImportPdfObject(pPageObject, nPageObjectIndex); + ImportPdfObject(pPageObject, pTextPage, nPageObjectIndex); } + FPDFText_ClosePage(pTextPage); + #if 0 // Now do the text. 
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage); @@ -993,8 +998,8 @@ void ImpSdrPdfImport::checkClip() } bool ImpSdrPdfImport::isClip() const { return !maClip.getB2DRange().isEmpty(); } - -void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex) +void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex) { if (pPageObject == nullptr) return; @@ -1003,7 +1008,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje switch (nPageObjectType) { case FPDF_PAGEOBJ_TEXT: - ImportText(pPageObject, nPageObjectIndex); + ImportText(pPageObject, pTextPage, nPageObjectIndex); break; case FPDF_PAGEOBJ_PATH: ImportPath(pPageObject, nPageObjectIndex); @@ -1015,7 +1020,7 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje SAL_WARN("sd.filter", "Got page object SHADING: " << nPageObjectIndex); break; case FPDF_PAGEOBJ_FORM: - ImportForm(pPageObject, nPageObjectIndex); + ImportForm(pPageObject, pTextPage, nPageObjectIndex); break; default: SAL_WARN("sd.filter", "Unknown PDF page object #" << nPageObjectIndex @@ -1024,7 +1029,8 @@ void ImpSdrPdfImport::ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObje } } -void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex) +void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex) { SAL_WARN("sd.filter", "Got page object FORM: " << nPageObjectIndex); @@ -1039,14 +1045,15 @@ void ImpSdrPdfImport::ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd for (int nIndex = 0; nIndex < nCount; ++nIndex) { FPDF_PAGEOBJECT pFormObject = FPDFFormObj_GetSubObject(pPageObject, nIndex); - ImportPdfObject(pFormObject, -1); + ImportPdfObject(pFormObject, pTextPage, -1); } // Restore the old one. 
mCurMatrix = aOldMatrix; } -void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex) +void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex) { SAL_WARN("sd.filter", "Got page object TEXT: " << nPageObjectIndex); float left; @@ -1078,14 +1085,15 @@ void ImpSdrPdfImport::ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectInd SAL_WARN("sd.filter", "Got TEXT origin: " << aPos); SAL_WARN("sd.filter", "Got TEXT Bounds: " << aRect); - const int nChars = FPDFTextObj_CountChars(pPageObject); + const int nChars = FPDFTextObj_CountChars(pPageObject) * 2; std::unique_ptr pText(new sal_Unicode[nChars + 1]); // + terminating null unsigned short* pShortText = reinterpret_cast(pText.get()); - const int nActualChars = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText); + const int nActualChars + = FPDFTextObj_GetTextProcessed(pPageObject, pTextPage, 0, nChars, pShortText); if (nActualChars <= 0) { - SAL_WARN("sd.filter", "Got not TEXT"); + SAL_WARN("sd.filter", "Got no TEXT"); return; } diff --git a/svx/source/svdraw/svdpdf.hxx b/svx/source/svdraw/svdpdf.hxx index d36c939be91a..4b2fba1bf8cd 100644 --- a/svx/source/svdraw/svdpdf.hxx +++ b/svx/source/svdraw/svdpdf.hxx @@ -42,6 +42,8 @@ class SdrObject; class SvdProgressInfo; typedef void* FPDF_DOCUMENT; typedef void* FPDF_PAGEOBJECT; // (text, path, etc.) +typedef void* FPDF_TEXTPAGE; + // Helper Class to import PDF class ImpSdrPdfImport final { @@ -85,7 +87,6 @@ class ImpSdrPdfImport final double d() const { return md; } double e() const { return me; } double f() const { return mf; } - /// Mutliply this * other. void Concatinate(const Matrix& other) { @@ -155,7 +156,6 @@ class ImpSdrPdfImport final /// Correct the vertical coordinate to start at the top. /// PDF coordinate system has orign at the bottom right. double correctVertOrigin(double offsetPts) const { return mdPageHeightPts - offsetPts; } - /// Convert PDF points to logic (twips). 
tools::Rectangle PointsToLogic(double left, double right, double top, double bottom) const; Point PointsToLogic(double x, double y) const; @@ -164,11 +164,12 @@ class ImpSdrPdfImport final void checkClip(); bool isClip() const; - void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); - void ImportForm(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); + void ImportPdfObject(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, + int nPageObjectIndex); + void ImportForm(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex); void ImportImage(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); void ImportPath(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); - void ImportText(FPDF_PAGEOBJECT pPageObject, int nPageObjectIndex); + void ImportText(FPDF_PAGEOBJECT pPageObject, FPDF_TEXTPAGE pTextPage, int nPageObjectIndex); void ImportText(const Point& rPos, const Size& rSize, const OUString& rStr); void SetupPageScale(const double dPageWidth, const double dPageHeight); @@ -192,7 +193,6 @@ public: ~ImpSdrPdfImport(); int GetPageCount() const { return mnPageCount; } - size_t DoImport(SdrObjList& rDestList, size_t nInsPos, int nPageNumber, SvdProgressInfo* pProgrInfo = nullptr); }; -- cgit v1.2.3