diff options
author | Ashod Nakashian <ashod.nakashian@collabora.co.uk> | 2018-04-08 23:38:55 -0400 |
---|---|---|
committer | Jan Holesovsky <kendy@collabora.com> | 2018-05-22 12:17:12 +0200 |
commit | 7dc905d1e9b561bb71f58881190bb6f590d09d80 (patch) | |
tree | 68de1d3fa39467d6ae98578cbb20b83cdff1de42 | |
parent | 07cd74846d41779b5a315a081cdfb1db6b5e18a3 (diff) |
svx: more accurate PDF text importing
Change-Id: If37119510cbc091dc86cb5f699984186167745c7
-rw-r--r-- | external/pdfium/edit.patch.1 | 136 | ||||
-rw-r--r-- | svx/source/svdraw/svdpdf.cxx | 63 |
2 files changed, 193 insertions, 6 deletions
diff --git a/external/pdfium/edit.patch.1 b/external/pdfium/edit.patch.1 index 78cf4c3394c6..b7cd86e5ff2d 100644 --- a/external/pdfium/edit.patch.1 +++ b/external/pdfium/edit.patch.1 @@ -35,10 +35,17 @@ index 0a01ae0..fad2920 100644 DrawTextPathWithPattern(textobj, pObj2Device, pFont, font_size, &text_matrix, bFill, bStroke); diff --git a/fpdfsdk/fpdfeditpage.cpp b/fpdfsdk/fpdfeditpage.cpp -index ca2cf3f..ef4b958 100644 +index ca2cf3f..ac36788 100644 --- a/fpdfsdk/fpdfeditpage.cpp +++ b/fpdfsdk/fpdfeditpage.cpp -@@ -17,6 +17,7 @@ +@@ -11,12 +11,14 @@ + #include <utility> + + #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h" ++#include "core/fpdfapi/font/cpdf_font.h" + #include "core/fpdfapi/page/cpdf_form.h" + #include "core/fpdfapi/page/cpdf_formobject.h" + #include "core/fpdfapi/page/cpdf_imageobject.h" #include "core/fpdfapi/page/cpdf_page.h" #include "core/fpdfapi/page/cpdf_pageobject.h" #include "core/fpdfapi/page/cpdf_pathobject.h" @@ -46,11 +53,31 @@ index ca2cf3f..ef4b958 100644 #include "core/fpdfapi/page/cpdf_shadingobject.h" #include "core/fpdfapi/parser/cpdf_array.h" #include "core/fpdfapi/parser/cpdf_document.h" -@@ -363,3 +364,20 @@ FPDFPageObj_GetBounds(FPDF_PAGEOBJECT pageObject, +@@ -363,3 +365,103 @@ FPDFPageObj_GetBounds(FPDF_PAGEOBJECT pageObject, *top = bbox.top; return true; } + ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object) ++{ ++ if (!text_object) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ return pTxtObj->CountChars(); ++} ++ ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object) ++{ ++ if (!text_object) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ return pTxtObj->GetFontSize(); ++} ++ +FPDF_EXPORT void FPDF_CALLCONV +FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object, + double* a, @@ -67,6 +94,69 @@ index ca2cf3f..ef4b958 100644 + *c = matrix.c; + *d = matrix.d; +} ++ ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index) ++{ ++ if (!text_object || index < 0) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ if (index > pTxtObj->CountChars()) ++ return 0; ++ ++ CPDF_TextObjectItem info; ++ pTxtObj->GetCharInfo(index, &info); ++ return info.m_CharCode; ++} ++ ++FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, ++ int char_start, ++ int char_count, ++ unsigned short* result) { ++ if (!text_object || char_start < 0 || char_count < 0 || !result) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ int char_available = pTxtObj->CountChars() - char_start; ++ if (char_available <= 0) ++ return 0; ++ ++ char_count = std::min(char_count, char_available); ++ if (char_count == 0) { ++ // Writing out "", which has a character count of 1 due to the NUL. ++ *result = '\0'; ++ return 1; ++ } ++ ++ CPDF_Font* pFont = pTxtObj->GetFont(); ++ WideString str; ++ for (uint32_t charcode : pTxtObj->GetCharCodes()) { ++ if (charcode != CPDF_Font::kInvalidCharCode) ++ str += pFont->UnicodeFromCharCode(charcode); ++ } ++ ++// CFX_WideTextBuf m_TextBuf; ++// WideString str = textpage->GetPageText(char_start, char_count); ++// return WideString(m_TextBuf.AsStringView().Mid( ++// static_cast<size_t>(text_start), static_cast<size_t>(text_count))); ++ ++// if (str.GetLength() > static_cast<size_t>(char_count)) ++// str = str.Left(static_cast<size_t>(char_count)); ++ ++ // Reincode in UTF-16. ++// WideString str = text.UTF8Decode(); ++ ++ // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected ++ // the number of items to stay the same. ++ ByteString byte_str = str.UTF16LE_Encode(); ++ size_t byte_str_len = byte_str.GetLength(); ++ int ret_count = byte_str_len / sizeof(unsigned short); ++ ++ ASSERT(ret_count <= char_count + 1); // +1 to account for the NUL terminator. ++ memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len); ++ return ret_count; ++} diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp index 68bf4f8..e073b20 100644 --- a/fpdfsdk/fpdftext.cpp @@ -101,13 +191,33 @@ index 68bf4f8..e073b20 100644 int index, double* left, diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h -index 54735a3..3642a2a 100644 +index 54735a3..a9c1a25 100644 --- a/public/fpdf_edit.h +++ b/public/fpdf_edit.h -@@ -761,6 +761,21 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document, +@@ -761,6 +761,57 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document, FPDF_FONT font, float font_size); ++// Get the number of characters from a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// Return Value: ++// A character count in the text object. ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object); ++ ++ ++// Get the font size of a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// ++// Return Value: ++// The value of the font size ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object); ++ +// Get the matrix of a particular text object. +// +// text_object - Handle of text object returned by FPDFPageObj_NewTextObj @@ -123,6 +233,22 @@ index 54735a3..3642a2a 100644 + double* c, + double* d); + ++// Get the unicode of a special character in a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// index - The index of the character to get the unicode. ++// Return Value: ++// The unicode value. ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index); ++ ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, ++ int char_start, ++ int char_count, ++ unsigned short* result); ++ #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx index 85bb13b86480..ef1cea9683fb 100644 --- a/svx/source/svdraw/svdpdf.cxx +++ b/svx/source/svdraw/svdpdf.cxx @@ -233,8 +233,67 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc switch (nPageObjectType) { case FPDF_PAGEOBJ_TEXT: + { SAL_WARN("sd.filter", "Got page object TEXT"); - break; + float left; + float bottom; + float right; + float top; + if (!FPDFPageObj_GetBounds(pPageObject, &left, &bottom, &right, &top)) + { + SAL_WARN("sd.filter", "FAILED to get TEXT bounds"); + } + + SAL_WARN("sd.filter", "Got TEXT bounds left: " << left << ", right: " << right + << ", top: " << top + << ", bottom: " << bottom); + Rectangle aRect = PointsToLogic(left, right, top, bottom); + + double dFontScale = 1.0; + geometry::Matrix2D aMatrix; + FPDFTextObj_GetMatrix(pPageObject, &aMatrix.m00, &aMatrix.m01, &aMatrix.m10, + &aMatrix.m11); + if (aMatrix.m00 != aMatrix.m11 || aMatrix.m00 <= 0) + { + SAL_WARN("sd.filter", "Bogus font scale matrix (" + << aMatrix.m00 << ',' << aMatrix.m11 + << "), will use heuristic height of " + << aRect.GetHeight() << "."); + dFontScale = aRect.GetHeight(); + } + else + dFontScale = aMatrix.m00; + + double dFontSize = FPDFTextObj_GetFontSize(pPageObject); + SAL_WARN("sd.filter", "Got Font Size: " << dFontSize); + dFontSize *= dFontScale; + SAL_WARN("sd.filter", "Got Font Size Scaled: " << dFontSize); + dFontSize = lcl_PointToPixel(dFontSize); + SAL_WARN("sd.filter", "Got Font Pixel Size: " << dFontSize); + dFontSize = lcl_ToLogic(dFontSize); + SAL_WARN("sd.filter", "Got Font Logic Size: " << dFontSize); + vcl::Font aFnt = mpVD->GetFont(); + aFnt.SetFontSize(Size(dFontSize, dFontSize)); + mpVD->SetFont(aFnt); + + const int nChars = FPDFTextObj_CountChars(pPageObject); + std::unique_ptr<sal_Unicode[]> pText( + new sal_Unicode[nChars + 1]); // + terminating null + + unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get()); + const int nActualChars + = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText); + OUString sText(pText.get(), nActualChars); + + // for (int nChar = 0; nChar < nChars; ++nChar) + // pText[nChar] = static_cast<sal_Unicode>(FPDFTextObj_GetUnicode(pPageObject, nChar)); + // OUString sText(pText.get(), nChars); + SAL_WARN("sd.filter", "Got Text #" << nPageObjectIndex + 1 << " (" << nChars + << "): [" << sText << "]."); + + ImportText(aRect.TopLeft(), sText); + } + break; case FPDF_PAGEOBJ_PATH: SAL_WARN("sd.filter", "Got page object PATH"); break; @@ -253,6 +312,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc } } +#if 0 // Now do the text. FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage); if (pTextPage != nullptr) @@ -403,6 +463,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc FPDFText_ClosePage(pTextPage); } +#endif FPDF_ClosePage(pPdfPage); } |