svx: more accurate PDF text importing

Change-Id: If37119510cbc091dc86cb5f699984186167745c7 (cherry picked from commit 7dc905d1e9b561bb71f58881190bb6f590d09d80)
author: Ashod Nakashian <ashod.nakashian@collabora.co.uk> 2018-04-08 23:38:55 -0400
committer: Jan Holesovsky <kendy@collabora.com> 2018-06-06 12:48:26 +0200
commit: 3fdaadbf8c63c68a1de0043e3f0ea9e336d0470e (patch)
tree: 0a159210f020544ed465c660e309aba58e0341b7
parent: 9777681285c29accf041e771692687b0cc0ea4b7 (diff)
3 files changed, 236 insertions, 1 deletions
diff --git a/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
new file mode 100644
index 000000000000..ab5564a87353
--- /dev/null
+++ b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
@@ -0,0 +1,173 @@
+From 5f83d0a3fac4f8ccef457c03b74433ffd7b12e2a Mon Sep 17 00:00:00 2001
+From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
+Date: Tue, 5 Jun 2018 11:28:30 +0200
+Subject: [PATCH 02/14] svx: more accurate PDF text importing
+
+---
+ pdfium/fpdfsdk/fpdf_editpage.cpp | 84 ++++++++++++++++++++++++++++++++++++++++
+ pdfium/public/fpdf_edit.h        | 36 +++++++++++++++++
+ 2 files changed, 120 insertions(+)
+
+diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
+index 912df63..3244943 100644
+--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
+@@ -12,6 +12,7 @@
+ #include <vector>
+ 
+ #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h"
++#include "core/fpdfapi/font/cpdf_font.h"
+ #include "core/fpdfapi/page/cpdf_form.h"
+ #include "core/fpdfapi/page/cpdf_formobject.h"
+ #include "core/fpdfapi/page/cpdf_imageobject.h"
+@@ -626,6 +627,26 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) {
+   return true;
+ }
+ 
++// Number of characters held by |text_object|; a null handle counts as empty.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object) {
++  if (text_object == nullptr)
++    return 0;
++
++  auto* text = static_cast<CPDF_TextObject*>(text_object);
++  return text->CountChars();
++}
++
++// Font size of |text_object| (0 for a null handle). NOTE: the int return type
++// drops any fractional part of the size reported by the text object.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object) {
++  if (text_object == nullptr)
++    return 0;
++  auto* text = static_cast<CPDF_TextObject*>(text_object);
++  return static_cast<int>(text->GetFontSize());
++}
++
+ FPDF_EXPORT void FPDF_CALLCONV
+ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+                       double* a,
+@@ -642,3 +663,66 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+   *c = matrix.c;
+   *d = matrix.d;
+ }
++
++// Returns the raw char code (m_CharCode, not mapped through the font's Unicode
++// table) of character |index|; 0 for a null object or out-of-range |index|.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index) {
++  if (!text_object || index < 0)
++    return 0;
++
++  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++  if (index >= pTxtObj->CountChars())  // Valid indices are [0, CountChars()).
++    return 0;
++  CPDF_TextObjectItem info;
++  pTxtObj->GetCharInfo(index, &info);
++  return info.m_CharCode;
++}
++
++// Copies up to |char_count| characters of |text_object|, starting at character
++// |char_start|, into |result| as NUL-terminated UTF-16LE. |result| must hold
++// |char_count| + 1 units. Returns units written (incl. NUL), 0 on failure.
++FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++                                                  int char_start,
++                                                  int char_count,
++                                                  unsigned short* result) {
++  if (!text_object || char_start < 0 || char_count < 0 || !result)
++    return 0;
++
++  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++  int char_available = pTxtObj->CountChars() - char_start;
++  if (char_available <= 0)
++    return 0;
++
++  char_count = std::min(char_count, char_available);
++  if (char_count == 0) {
++    // Writing out "", which has a character count of 1 due to the NUL.
++    *result = '\0';
++    return 1;
++  }
++
++  // Convert only the requested [char_start, char_start + char_count) window;
++  // invalid char codes are skipped and do not advance the character index.
++  CPDF_Font* pFont = pTxtObj->GetFont();
++  WideString str;
++  int char_index = 0;
++  for (uint32_t charcode : pTxtObj->GetCharCodes()) {
++    if (charcode == CPDF_Font::kInvalidCharCode)
++      continue;
++    if (char_index >= char_start + char_count)
++      break;
++    if (char_index++ >= char_start)
++      str += pFont->UnicodeFromCharCode(charcode);
++  }
++
++  // One char code may expand to several units; truncate so the copy below can
++  // never write past the caller's |char_count| + 1 buffer, even in release.
++  if (str.GetLength() > static_cast<size_t>(char_count))
++    str = str.Left(static_cast<size_t>(char_count));
++
++  // UTF16LE_Encode() yields one 16-bit unit per item plus the NUL terminator.
++  ByteString byte_str = str.UTF16LE_Encode();
++  size_t byte_str_len = byte_str.GetLength();
++  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
++  return static_cast<int>(byte_str_len / sizeof(unsigned short));
++}
+diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
+index 3f45495..602849f 100644
+--- a/pdfium/public/fpdf_edit.h
++++ b/pdfium/public/fpdf_edit.h
+@@ -971,6 +971,26 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
+                           FPDF_FONT font,
+                           float font_size);
+ 
++// Get the number of characters from a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++//               or FPDFPageObj_NewTextObjEx.
++// Return Value:
++// A character count in the text object.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
++
++
++// Get the font size of a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++//               or FPDFPageObj_NewTextObjEx.
++//
++// Return Value:
++// The value of the font size
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
++
+ // Get the matrix of a particular text object.
+ //
+ // text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+@@ -986,6 +1006,22 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+                       double* c,
+                       double* d);
+ 
++// Get the unicode of a specific character in a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++//               or FPDFPageObj_NewTextObjEx.
++// index - The index of the character to get the unicode.
++// Return Value:
++// The unicode value.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index);
++
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++                    int char_start,
++                    int char_count,
++                    unsigned short* result);
++
+ #ifdef __cplusplus
+ }  // extern "C"
+ #endif  // __cplusplus
+-- 
+2.16.3
+
diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk
index 58c014f41252..22e762695300 100644
--- a/external/pdfium/UnpackedTarball_pdfium.mk
+++ b/external/pdfium/UnpackedTarball_pdfium.mk
@@ -15,6 +15,7 @@ pdfium_patches += icu.patch.1
 pdfium_patches += build.patch.1
 # Adds missing editing API
 pdfium_patches += 0001-svx-import-PDF-text-using-PDFium.patch.2
+pdfium_patches += 0002-svx-more-accurate-PDF-text-importing.patch.2
 
 $(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium))
 
diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx
index 89f89d0e7050..602e630563e8 100644
--- a/svx/source/svdraw/svdpdf.cxx
+++ b/svx/source/svdraw/svdpdf.cxx
@@ -236,8 +236,67 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
             switch (nPageObjectType)
             {
                 case FPDF_PAGEOBJ_TEXT:
+                {
                     SAL_WARN("sd.filter", "Got page object TEXT");
-                    break;
+                    float left;
+                    float bottom;
+                    float right;
+                    float top;
+                    if (!FPDFPageObj_GetBounds(pPageObject, &left, &bottom, &right, &top))
+                    {
+                        SAL_WARN("sd.filter", "FAILED to get TEXT bounds");
+                    }
+
+                    SAL_WARN("sd.filter", "Got TEXT bounds left: " << left << ", right: " << right
+                                                                   << ", top: " << top
+                                                                   << ", bottom: " << bottom);
+                    Rectangle aRect = PointsToLogic(left, right, top, bottom);
+
+                    double dFontScale = 1.0;
+                    geometry::Matrix2D aMatrix;
+                    FPDFTextObj_GetMatrix(pPageObject, &aMatrix.m00, &aMatrix.m01, &aMatrix.m10,
+                                          &aMatrix.m11);
+                    if (aMatrix.m00 != aMatrix.m11 || aMatrix.m00 <= 0)
+                    {
+                        SAL_WARN("sd.filter", "Bogus font scale matrix ("
+                                                  << aMatrix.m00 << ',' << aMatrix.m11
+                                                  << "), will use heuristic height of "
+                                                  << aRect.GetHeight() << ".");
+                        dFontScale = aRect.GetHeight();
+                    }
+                    else
+                        dFontScale = aMatrix.m00;
+
+                    double dFontSize = FPDFTextObj_GetFontSize(pPageObject);
+                    SAL_WARN("sd.filter", "Got Font Size: " << dFontSize);
+                    dFontSize *= dFontScale;
+                    SAL_WARN("sd.filter", "Got Font Size Scaled: " << dFontSize);
+                    dFontSize = lcl_PointToPixel(dFontSize);
+                    SAL_WARN("sd.filter", "Got Font Pixel Size: " << dFontSize);
+                    dFontSize = lcl_ToLogic(dFontSize);
+                    SAL_WARN("sd.filter", "Got Font Logic Size: " << dFontSize);
+                    vcl::Font aFnt = mpVD->GetFont();
+                    aFnt.SetFontSize(Size(dFontSize, dFontSize));
+                    mpVD->SetFont(aFnt);
+
+                    const int nChars = FPDFTextObj_CountChars(pPageObject);
+                    std::unique_ptr<sal_Unicode[]> pText(
+                        new sal_Unicode[nChars + 1]); // + terminating null
+
+                    unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get());
+                    const int nActualChars
+                        = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText);
+                    OUString sText(pText.get(), nActualChars);
+
+                    // for (int nChar = 0; nChar < nChars; ++nChar)
+                    //     pText[nChar] = static_cast<sal_Unicode>(FPDFTextObj_GetUnicode(pPageObject, nChar));
+                    // OUString sText(pText.get(), nChars);
+                    SAL_WARN("sd.filter", "Got Text #" << nPageObjectIndex + 1 << " (" << nChars
+                                                       << "): [" << sText << "].");
+
+                    ImportText(aRect.TopLeft(), sText);
+                }
+                break;
                 case FPDF_PAGEOBJ_PATH:
                     SAL_WARN("sd.filter", "Got page object PATH");
                     break;
@@ -256,6 +315,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
             }
         }
 
+#if 0
         // Now do the text.
         FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
         if (pTextPage != nullptr)
@@ -406,6 +466,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
 
             FPDFText_ClosePage(pTextPage);
         }
+#endif
 
         FPDF_ClosePage(pPdfPage);
     }
author	Ashod Nakashian <ashod.nakashian@collabora.co.uk>	2018-04-08 23:38:55 -0400
committer	Jan Holesovsky <kendy@collabora.com>	2018-06-06 12:48:26 +0200
commit	3fdaadbf8c63c68a1de0043e3f0ea9e336d0470e (patch)
tree	0a159210f020544ed465c660e309aba58e0341b7
parent	9777681285c29accf041e771692687b0cc0ea4b7 (diff)