summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAshod Nakashian <ashod.nakashian@collabora.co.uk>2018-04-08 23:38:55 -0400
committerJan Holesovsky <kendy@collabora.com>2018-06-06 12:48:26 +0200
commit3fdaadbf8c63c68a1de0043e3f0ea9e336d0470e (patch)
tree0a159210f020544ed465c660e309aba58e0341b7
parent9777681285c29accf041e771692687b0cc0ea4b7 (diff)
svx: more accurate PDF text importing
Change-Id: If37119510cbc091dc86cb5f699984186167745c7 (cherry picked from commit 7dc905d1e9b561bb71f58881190bb6f590d09d80)
-rw-r--r--external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2173
-rw-r--r--external/pdfium/UnpackedTarball_pdfium.mk1
-rw-r--r--svx/source/svdraw/svdpdf.cxx63
3 files changed, 236 insertions, 1 deletions
diff --git a/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
new file mode 100644
index 000000000000..ab5564a87353
--- /dev/null
+++ b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
@@ -0,0 +1,173 @@
+From 5f83d0a3fac4f8ccef457c03b74433ffd7b12e2a Mon Sep 17 00:00:00 2001
+From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
+Date: Tue, 5 Jun 2018 11:28:30 +0200
+Subject: [PATCH 02/14] svx: more accurate PDF text importing
+
+---
+ pdfium/fpdfsdk/fpdf_editpage.cpp | 84 ++++++++++++++++++++++++++++++++++++++++
+ pdfium/public/fpdf_edit.h | 36 +++++++++++++++++
+ 2 files changed, 120 insertions(+)
+
+diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
+index 912df63..3244943 100644
+--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
+@@ -12,6 +12,7 @@
+ #include <vector>
+
+ #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h"
++#include "core/fpdfapi/font/cpdf_font.h"
+ #include "core/fpdfapi/page/cpdf_form.h"
+ #include "core/fpdfapi/page/cpdf_formobject.h"
+ #include "core/fpdfapi/page/cpdf_imageobject.h"
+@@ -626,6 +627,26 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) {
+ return true;
+ }
+
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object)
++{
++ if (!text_object) // a null handle is reported as containing no characters
++ return 0;
++
++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); // unchecked cast: assumes the handle is a text object - TODO confirm
++ return pTxtObj->CountChars(); // forwards the object's own character count
++}
++
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object)
++{
++ if (!text_object) // a null handle yields a font size of 0
++ return 0;
++
++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); // unchecked cast: assumes the handle is a text object - TODO confirm
++ return pTxtObj->GetFontSize(); // NOTE(review): converted to int on return; a fractional size would be truncated - confirm the underlying type
++}
++
+ FPDF_EXPORT void FPDF_CALLCONV
+ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ double* a,
+@@ -642,3 +663,66 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ *c = matrix.c;
+ *d = matrix.d;
+ }
++
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index)
++{
++ if (!text_object || index < 0) // null handle or negative index: nothing to return
++ return 0;
++
++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++ if (index >= pTxtObj->CountChars()) // valid indices are 0 .. CountChars() - 1
++ return 0;
++
++ CPDF_TextObjectItem info;
++ pTxtObj->GetCharInfo(index, &info);
++ return info.m_CharCode; // the char code stored for the character at |index|
++}
++
++FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++ int char_start,
++ int char_count,
++ unsigned short* result) {
++ if (!text_object || char_start < 0 || char_count < 0 || !result)
++ return 0;
++
++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++ int char_available = pTxtObj->CountChars() - char_start;
++ if (char_available <= 0)
++ return 0;
++
++ char_count = std::min(char_count, char_available);
++ if (char_count == 0) {
++ // Writing out "", which has a character count of 1 due to the NUL.
++ *result = '\0';
++ return 1;
++ }
++
++ CPDF_Font* pFont = pTxtObj->GetFont();
++ WideString str;
++ for (uint32_t charcode : pTxtObj->GetCharCodes()) {
++ if (charcode != CPDF_Font::kInvalidCharCode)
++ str += pFont->UnicodeFromCharCode(charcode);
++ }
++
++ // NOTE(review): |char_start| is not applied to |str|; the text always starts
++ // at the first character and |char_start| only lowers the returned count.
++ //
++ // One char code may map to several UTF-16 units. Clamp |str| so the memcpy()
++ // below cannot overrun |result|; the ASSERT may be compiled out in release.
++ if (str.GetLength() > static_cast<size_t>(char_count))
++ str = str.Left(static_cast<size_t>(char_count));
++
++ // Re-encode in UTF-16.
++ //
++ // The encoded buffer carries a trailing NUL, which is part of the returned
++ // count. UTF16LE_Encode doesn't handle surrogate pairs properly, so the
++ // number of items is expected to stay the same.
++ ByteString byte_str = str.UTF16LE_Encode();
++ size_t byte_str_len = byte_str.GetLength();
++ int ret_count = byte_str_len / sizeof(unsigned short);
++
++ ASSERT(ret_count <= char_count + 1); // +1 to account for the NUL terminator.
++ memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
++ return ret_count;
++}
+diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
+index 3f45495..602849f 100644
+--- a/pdfium/public/fpdf_edit.h
++++ b/pdfium/public/fpdf_edit.h
+@@ -971,6 +971,26 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
+ FPDF_FONT font,
+ float font_size);
+
++// Get the number of characters from a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// Return Value:
++// A character count in the text object.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
++
++
++// Get the font size of a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++//
++// Return Value:
++// The value of the font size
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
++
+ // Get the matrix of a particular text object.
+ //
+ // text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+@@ -986,6 +1006,22 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ double* c,
+ double* d);
+
+// Get the unicode of a specific character in a text object.
+//
+// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+// or FPDFPageObj_NewTextObjEx.
+// index - The zero-based index of the character to get the unicode of.
+// Return Value:
+// The unicode value, or 0 if text_object is null or index is negative.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index);
++
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++ int char_start,
++ int char_count,
++ unsigned short* result);
++
+ #ifdef __cplusplus
+ } // extern "C"
+ #endif // __cplusplus
+--
+2.16.3
+
diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk
index 58c014f41252..22e762695300 100644
--- a/external/pdfium/UnpackedTarball_pdfium.mk
+++ b/external/pdfium/UnpackedTarball_pdfium.mk
@@ -15,6 +15,7 @@ pdfium_patches += icu.patch.1
pdfium_patches += build.patch.1
# Adds missing editing API
pdfium_patches += 0001-svx-import-PDF-text-using-PDFium.patch.2
+pdfium_patches += 0002-svx-more-accurate-PDF-text-importing.patch.2
$(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium))
diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx
index 89f89d0e7050..602e630563e8 100644
--- a/svx/source/svdraw/svdpdf.cxx
+++ b/svx/source/svdraw/svdpdf.cxx
@@ -236,8 +236,67 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
switch (nPageObjectType)
{
case FPDF_PAGEOBJ_TEXT:
+ {
SAL_WARN("sd.filter", "Got page object TEXT");
- break;
+ float left;
+ float bottom;
+ float right;
+ float top;
+ if (!FPDFPageObj_GetBounds(pPageObject, &left, &bottom, &right, &top))
+ {
+ SAL_WARN("sd.filter", "FAILED to get TEXT bounds");
+ }
+
+ SAL_WARN("sd.filter", "Got TEXT bounds left: " << left << ", right: " << right
+ << ", top: " << top
+ << ", bottom: " << bottom);
+ Rectangle aRect = PointsToLogic(left, right, top, bottom);
+
+ double dFontScale = 1.0;
+ geometry::Matrix2D aMatrix;
+ FPDFTextObj_GetMatrix(pPageObject, &aMatrix.m00, &aMatrix.m01, &aMatrix.m10,
+ &aMatrix.m11);
+ if (aMatrix.m00 != aMatrix.m11 || aMatrix.m00 <= 0)
+ {
+ SAL_WARN("sd.filter", "Bogus font scale matrix ("
+ << aMatrix.m00 << ',' << aMatrix.m11
+ << "), will use heuristic height of "
+ << aRect.GetHeight() << ".");
+ dFontScale = aRect.GetHeight();
+ }
+ else
+ dFontScale = aMatrix.m00;
+
+ double dFontSize = FPDFTextObj_GetFontSize(pPageObject);
+ SAL_WARN("sd.filter", "Got Font Size: " << dFontSize);
+ dFontSize *= dFontScale;
+ SAL_WARN("sd.filter", "Got Font Size Scaled: " << dFontSize);
+ dFontSize = lcl_PointToPixel(dFontSize);
+ SAL_WARN("sd.filter", "Got Font Pixel Size: " << dFontSize);
+ dFontSize = lcl_ToLogic(dFontSize);
+ SAL_WARN("sd.filter", "Got Font Logic Size: " << dFontSize);
+ vcl::Font aFnt = mpVD->GetFont();
+ aFnt.SetFontSize(Size(dFontSize, dFontSize));
+ mpVD->SetFont(aFnt);
+
+ const int nChars = FPDFTextObj_CountChars(pPageObject);
+ std::unique_ptr<sal_Unicode[]> pText(
+ new sal_Unicode[nChars + 1]); // + terminating null
+
+ unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get());
+ const int nActualChars
+ = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText);
+ OUString sText(pText.get(), nActualChars);
+
+ // for (int nChar = 0; nChar < nChars; ++nChar)
+ // pText[nChar] = static_cast<sal_Unicode>(FPDFTextObj_GetUnicode(pPageObject, nChar));
+ // OUString sText(pText.get(), nChars);
+ SAL_WARN("sd.filter", "Got Text #" << nPageObjectIndex + 1 << " (" << nChars
+ << "): [" << sText << "].");
+
+ ImportText(aRect.TopLeft(), sText);
+ }
+ break;
case FPDF_PAGEOBJ_PATH:
SAL_WARN("sd.filter", "Got page object PATH");
break;
@@ -256,6 +315,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
}
}
+#if 0
// Now do the text.
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
if (pTextPage != nullptr)
@@ -406,6 +466,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
FPDFText_ClosePage(pTextPage);
}
+#endif
FPDF_ClosePage(pPdfPage);
}