summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAshod Nakashian <ashod.nakashian@collabora.co.uk>2018-04-08 23:38:55 -0400
committerJan Holesovsky <kendy@collabora.com>2018-05-22 12:17:12 +0200
commit7dc905d1e9b561bb71f58881190bb6f590d09d80 (patch)
tree68de1d3fa39467d6ae98578cbb20b83cdff1de42
parent07cd74846d41779b5a315a081cdfb1db6b5e18a3 (diff)
svx: more accurate PDF text importing
Change-Id: If37119510cbc091dc86cb5f699984186167745c7
-rw-r--r--external/pdfium/edit.patch.1136
-rw-r--r--svx/source/svdraw/svdpdf.cxx63
2 files changed, 193 insertions, 6 deletions
diff --git a/external/pdfium/edit.patch.1 b/external/pdfium/edit.patch.1
index 78cf4c3394c6..b7cd86e5ff2d 100644
--- a/external/pdfium/edit.patch.1
+++ b/external/pdfium/edit.patch.1
@@ -35,10 +35,17 @@ index 0a01ae0..fad2920 100644
DrawTextPathWithPattern(textobj, pObj2Device, pFont, font_size,
&text_matrix, bFill, bStroke);
diff --git a/fpdfsdk/fpdfeditpage.cpp b/fpdfsdk/fpdfeditpage.cpp
-index ca2cf3f..ef4b958 100644
+index ca2cf3f..ac36788 100644
--- a/fpdfsdk/fpdfeditpage.cpp
+++ b/fpdfsdk/fpdfeditpage.cpp
-@@ -17,6 +17,7 @@
+@@ -11,12 +11,14 @@
+ #include <utility>
+
+ #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h"
++#include "core/fpdfapi/font/cpdf_font.h"
+ #include "core/fpdfapi/page/cpdf_form.h"
+ #include "core/fpdfapi/page/cpdf_formobject.h"
+ #include "core/fpdfapi/page/cpdf_imageobject.h"
#include "core/fpdfapi/page/cpdf_page.h"
#include "core/fpdfapi/page/cpdf_pageobject.h"
#include "core/fpdfapi/page/cpdf_pathobject.h"
@@ -46,11 +53,31 @@ index ca2cf3f..ef4b958 100644
#include "core/fpdfapi/page/cpdf_shadingobject.h"
#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_document.h"
-@@ -363,3 +364,20 @@ FPDFPageObj_GetBounds(FPDF_PAGEOBJECT pageObject,
+@@ -363,3 +365,103 @@ FPDFPageObj_GetBounds(FPDF_PAGEOBJECT pageObject,
*top = bbox.top;
return true;
}
+
++// Forwards to CPDF_TextObject::CountChars() for |text_object|.
++// A null handle yields 0.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object)
++{
++  if (text_object) {
++    auto* pTextObj = static_cast<CPDF_TextObject*>(text_object);
++    return pTextObj->CountChars();
++  }
++
++  return 0;
++}
++
++// Returns the font size reported by CPDF_TextObject::GetFontSize() for
++// |text_object|, or 0 when the handle is null.
++//
++// NOTE(review): the value is returned as int. If GetFontSize() yields a
++// fractional type, the fraction is dropped here -- confirm whether the
++// public signature should carry a floating-point value instead.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object)
++{
++  if (text_object) {
++    auto* pTextObj = static_cast<CPDF_TextObject*>(text_object);
++    return static_cast<int>(pTextObj->GetFontSize());
++  }
++
++  return 0;
++}
++
+FPDF_EXPORT void FPDF_CALLCONV
+FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ double* a,
@@ -67,6 +94,69 @@ index ca2cf3f..ef4b958 100644
+ *c = matrix.c;
+ *d = matrix.d;
+}
++
++// Returns the character code stored at position |index| of |text_object|.
++//
++// text_object - text object handle; a null handle yields 0.
++// index       - zero-based character position; negative or out-of-range
++//               values yield 0.
++//
++// The value returned is CPDF_TextObjectItem::m_CharCode exactly as filled in
++// by GetCharInfo(); no font-based mapping is applied in this function.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index)
++{
++  if (!text_object || index < 0)
++    return 0;
++
++  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++  // Valid positions are [0, CountChars()). |index| == CountChars() is already
++  // one past the end, so it has to be rejected as well.
++  if (index >= pTxtObj->CountChars())
++    return 0;
++
++  CPDF_TextObjectItem info;
++  pTxtObj->GetCharInfo(index, &info);
++  return info.m_CharCode;
++}
++
++// Decodes a run of characters of |text_object| to UTF-16LE and copies it into
++// |result|.
++//
++// text_object - text object handle.
++// char_start  - zero-based position of the first character to extract.
++// char_count  - maximum number of characters to extract; clamped to what is
++//               available after |char_start|.
++// result      - destination buffer; must have room for |char_count| + 1
++//               16-bit units (the extra one is the NUL terminator).
++//
++// Returns the number of 16-bit units written to |result| (the terminator is
++// expected to be part of that count), or 0 when an argument is invalid or
++// |char_start| is at or past the end of the text.
++FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++                                                  int char_start,
++                                                  int char_count,
++                                                  unsigned short* result) {
++  if (!text_object || char_start < 0 || char_count < 0 || !result)
++    return 0;
++
++  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++  int char_available = pTxtObj->CountChars() - char_start;
++  if (char_available <= 0)
++    return 0;
++
++  char_count = std::min(char_count, char_available);
++  if (char_count == 0) {
++    // Writing out "", which has a character count of 1 due to the NUL.
++    *result = '\0';
++    return 1;
++  }
++
++  // Decode only the requested window [char_start, char_start + char_count).
++  // Invalid char codes are skipped and do not advance the position.
++  // NOTE(review): this presumes CountChars() counts valid codes only --
++  // confirm. Either way the output stays bounded by the truncation below.
++  CPDF_Font* pFont = pTxtObj->GetFont();
++  const int char_end = char_start + char_count;
++  WideString str;
++  int char_index = 0;
++  for (uint32_t charcode : pTxtObj->GetCharCodes()) {
++    if (charcode == CPDF_Font::kInvalidCharCode)
++      continue;
++
++    const int current = char_index++;
++    if (current < char_start)
++      continue;
++    if (current >= char_end)
++      break;
++
++    str += pFont->UnicodeFromCharCode(charcode);
++  }
++
++  // A single char code may decode to more than one unit. Never hand back more
++  // than the caller asked for, otherwise the memcpy below would run past the
++  // end of |result| in builds where ASSERT is compiled out.
++  if (str.GetLength() > static_cast<size_t>(char_count))
++    str = str.Left(static_cast<size_t>(char_count));
++
++  // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected
++  // the number of items to stay the same.
++  ByteString byte_str = str.UTF16LE_Encode();
++  size_t byte_str_len = byte_str.GetLength();
++  int ret_count = byte_str_len / sizeof(unsigned short);
++
++  ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
++  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
++  return ret_count;
++}
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 68bf4f8..e073b20 100644
--- a/fpdfsdk/fpdftext.cpp
@@ -101,13 +191,33 @@ index 68bf4f8..e073b20 100644
int index,
double* left,
diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h
-index 54735a3..3642a2a 100644
+index 54735a3..a9c1a25 100644
--- a/public/fpdf_edit.h
+++ b/public/fpdf_edit.h
-@@ -761,6 +761,21 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
+@@ -761,6 +761,57 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
FPDF_FONT font,
float font_size);
++// Get the number of characters from a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// Return Value:
++// A character count in the text object.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
++
++
++// Get the font size of a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++//
++// Return Value:
++// The value of the font size
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
++
+// Get the matrix of a particular text object.
+//
+// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
@@ -123,6 +233,22 @@ index 54735a3..3642a2a 100644
+ double* c,
+ double* d);
+
++// Get the unicode of a specific character in a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// index - The index of the character to get the unicode.
++// Return Value:
++// The unicode value.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index);
++
++// Get the text of a text object as UTF-16LE.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// char_start - Index of the first character; must not be negative.
++// char_count - Number of characters requested; must not be negative.
++// result - Caller-owned output buffer; must not be NULL.
++// NOTE(review): the implementation asserts at most
++// char_count + 1 units (text plus NUL terminator) are
++// written, so size the buffer accordingly -- confirm.
++// Return Value:
++// Number of 16-bit units written to result; per the implementation's
++// comments this count includes the NUL terminator. Returns 0 if an
++// argument is invalid or char_start is at or beyond the character count.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++ int char_start,
++ int char_count,
++ unsigned short* result);
++
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx
index 85bb13b86480..ef1cea9683fb 100644
--- a/svx/source/svdraw/svdpdf.cxx
+++ b/svx/source/svdraw/svdpdf.cxx
@@ -233,8 +233,67 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
switch (nPageObjectType)
{
case FPDF_PAGEOBJ_TEXT:
+ {
SAL_WARN("sd.filter", "Got page object TEXT");
- break;
+ float left;
+ float bottom;
+ float right;
+ float top;
+ if (!FPDFPageObj_GetBounds(pPageObject, &left, &bottom, &right, &top))
+ {
+ SAL_WARN("sd.filter", "FAILED to get TEXT bounds");
+ }
+
+ SAL_WARN("sd.filter", "Got TEXT bounds left: " << left << ", right: " << right
+ << ", top: " << top
+ << ", bottom: " << bottom);
+ Rectangle aRect = PointsToLogic(left, right, top, bottom);
+
+ double dFontScale = 1.0;
+ geometry::Matrix2D aMatrix;
+ FPDFTextObj_GetMatrix(pPageObject, &aMatrix.m00, &aMatrix.m01, &aMatrix.m10,
+ &aMatrix.m11);
+ if (aMatrix.m00 != aMatrix.m11 || aMatrix.m00 <= 0)
+ {
+ SAL_WARN("sd.filter", "Bogus font scale matrix ("
+ << aMatrix.m00 << ',' << aMatrix.m11
+ << "), will use heuristic height of "
+ << aRect.GetHeight() << ".");
+ dFontScale = aRect.GetHeight();
+ }
+ else
+ dFontScale = aMatrix.m00;
+
+ double dFontSize = FPDFTextObj_GetFontSize(pPageObject);
+ SAL_WARN("sd.filter", "Got Font Size: " << dFontSize);
+ dFontSize *= dFontScale;
+ SAL_WARN("sd.filter", "Got Font Size Scaled: " << dFontSize);
+ dFontSize = lcl_PointToPixel(dFontSize);
+ SAL_WARN("sd.filter", "Got Font Pixel Size: " << dFontSize);
+ dFontSize = lcl_ToLogic(dFontSize);
+ SAL_WARN("sd.filter", "Got Font Logic Size: " << dFontSize);
+ vcl::Font aFnt = mpVD->GetFont();
+ aFnt.SetFontSize(Size(dFontSize, dFontSize));
+ mpVD->SetFont(aFnt);
+
+ const int nChars = FPDFTextObj_CountChars(pPageObject);
+ std::unique_ptr<sal_Unicode[]> pText(
+ new sal_Unicode[nChars + 1]); // + terminating null
+
+ unsigned short* pShortText = reinterpret_cast<unsigned short*>(pText.get());
+ const int nActualChars
+ = FPDFTextObj_GetText(pPageObject, 0, nChars, pShortText);
+ OUString sText(pText.get(), nActualChars);
+
+ // for (int nChar = 0; nChar < nChars; ++nChar)
+ // pText[nChar] = static_cast<sal_Unicode>(FPDFTextObj_GetUnicode(pPageObject, nChar));
+ // OUString sText(pText.get(), nChars);
+ SAL_WARN("sd.filter", "Got Text #" << nPageObjectIndex + 1 << " (" << nChars
+ << "): [" << sText << "].");
+
+ ImportText(aRect.TopLeft(), sText);
+ }
+ break;
case FPDF_PAGEOBJ_PATH:
SAL_WARN("sd.filter", "Got page object PATH");
break;
@@ -253,6 +312,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
}
}
+#if 0
// Now do the text.
FPDF_TEXTPAGE pTextPage = FPDFText_LoadPage(pPdfPage);
if (pTextPage != nullptr)
@@ -403,6 +463,7 @@ void ImpSdrPdfImport::DoLoopActions(SvdProgressInfo* pProgrInfo, sal_uInt32* pAc
FPDFText_ClosePage(pTextPage);
}
+#endif
FPDF_ClosePage(pPdfPage);
}