diff options
author | Khaled Hosny <khaledhosny@eglug.org> | 2018-04-26 12:55:26 +0200 |
---|---|---|
committer | Miklos Vajna <vmiklos@collabora.co.uk> | 2018-04-27 11:23:14 +0200 |
commit | c688b01d9102832226251fc84045408afe392459 (patch) | |
tree | e000d416369c3d4b032cf2614ce8e9d59eb0e68f | |
parent | dfdc165a48d711b867961d1f75ee36a1c9596dc0 (diff) |
tdf#66597 Fix PDF text extraction for complex text
Implement a more thorough strategy for embedding textual content in PDF
files:
* If there is a unique one to one or one to many mapping between each
glyph index and Unicode code points, use ToUnicode CMAP.
* If there is many to one or many to many mapping, use an ActualText
span embedding the original string, since ToUnicode can’t handle
these.
* If one glyph is used for several Unicode code points, also use
ActualText since ToUnicode can map each glyph in the font only once.
* Limit ActualText to a single cluster at a time, since using it for whole
words or sentences breaks text selection and highlighting in PDF
viewers (there will be no way to tell which glyphs belong to which
characters).
* Keep generating (now) redundant ToUnicode entries for compatibility
with old tools not supporting ActualText.
Change-Id: I33261811b59b3b8fe2164c2c21d3c52c417e6208
Reviewed-on: https://gerrit.libreoffice.org/53315
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
-rw-r--r-- | vcl/inc/sallayout.hxx | 4 | ||||
-rw-r--r-- | vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt | bin | 0 -> 8154 bytes | |||
-rw-r--r-- | vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt | bin | 0 -> 8265 bytes | |||
-rw-r--r-- | vcl/qa/cppunit/pdfexport/pdfexport.cxx | 196 | ||||
-rw-r--r-- | vcl/source/gdi/CommonSalLayout.cxx | 10 | ||||
-rw-r--r-- | vcl/source/gdi/pdfwriter_impl.cxx | 114 | ||||
-rw-r--r-- | vcl/source/gdi/pdfwriter_impl.hxx | 14 |
7 files changed, 320 insertions, 18 deletions
diff --git a/vcl/inc/sallayout.hxx b/vcl/inc/sallayout.hxx index 170c2a2c380d..e07d51e8b3e8 100644 --- a/vcl/inc/sallayout.hxx +++ b/vcl/inc/sallayout.hxx @@ -287,7 +287,8 @@ public: IS_VERTICAL = 0x008, IS_SPACING = 0x010, ALLOW_KASHIDA = 0x020, - IS_DROPPED = 0x040 + IS_DROPPED = 0x040, + IS_CLUSTER_START = 0x080 }; bool IsInCluster() const { return ((mnFlags & IS_IN_CLUSTER) != 0); } @@ -297,6 +298,7 @@ public: bool IsSpacing() const { return ((mnFlags & IS_SPACING) != 0); } bool AllowKashida() const { return ((mnFlags & ALLOW_KASHIDA) != 0); } bool IsDropped() const { return ((mnFlags & IS_DROPPED) != 0); } + bool IsClusterStart() const { return ((mnFlags & IS_CLUSTER_START) != 0); } }; class VCL_PLUGIN_PUBLIC GenericSalLayout : public SalLayout diff --git a/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt b/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt Binary files differnew file mode 100644 index 000000000000..7fecc55c6386 --- /dev/null +++ b/vcl/qa/cppunit/pdfexport/data/tdf66597-1.odt diff --git a/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt b/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt Binary files differnew file mode 100644 index 000000000000..3d7b5e59cc9d --- /dev/null +++ b/vcl/qa/cppunit/pdfexport/data/tdf66597-2.odt diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx index b9fe20df099f..d280f561fc64 100644 --- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx +++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx @@ -75,8 +75,12 @@ public: void testTdf115117_1a(); /// Test writing ToUnicode CMAP for RTL ligatures. void testTdf115117_2(); - /// Text extracting RTL text with ligatures. + /// Test extracting RTL text with ligatures. void testTdf115117_2a(); + /// Test writing ToUnicode CMAP for doubly encoded glyphs. + void testTdf66597_1(); + /// Test writing ActualText for many to one glyph to Unicode mapping. 
+ void testTdf66597_2(); #endif #endif @@ -101,6 +105,8 @@ public: CPPUNIT_TEST(testTdf115117_1a); CPPUNIT_TEST(testTdf115117_2); CPPUNIT_TEST(testTdf115117_2a); + CPPUNIT_TEST(testTdf66597_1); + CPPUNIT_TEST(testTdf66597_2); #endif #endif CPPUNIT_TEST_SUITE_END(); @@ -976,6 +982,194 @@ void PdfExportTest::testTdf115117_2a() OUString aActualText(aChars.data(), aChars.size()); CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText); } + +// This requires Amiri font, if it is missing the test will fail. +void PdfExportTest::testTdf66597_1() +{ + // FIXME: Fallback font is used on Windows for some reason. +#if !defined _WIN32 + vcl::filter::PDFDocument aDocument; + load("tdf66597-1.odt", aDocument); + + { + // Get access to ToUnicode of the first font + vcl::filter::PDFObjectElement* pToUnicode = nullptr; + for (const auto& aElement : aDocument.GetElements()) + { + auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get()); + if (!pObject) + continue; + auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type")); + if (pType && pType->GetValue() == "Font") + { + auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("BaseFont")); + auto aName = pName->GetValue().copy(7); // skip the subset id + CPPUNIT_ASSERT_EQUAL_MESSAGE("Unexpected font name", OString("Amiri-Regular"), aName); + + auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode")); + CPPUNIT_ASSERT(pToUnicodeRef); + pToUnicode = pToUnicodeRef->LookupObject(); + break; + } + } + + CPPUNIT_ASSERT(pToUnicode); + auto pStream = pToUnicode->GetStream(); + CPPUNIT_ASSERT(pStream); + SvMemoryStream aObjectStream; + ZCodec aZCodec; + aZCodec.BeginCompression(); + pStream->GetMemory().Seek(0); + aZCodec.Decompress(pStream->GetMemory(), aObjectStream); + CPPUNIT_ASSERT(aZCodec.EndCompression()); + aObjectStream.Seek(0); + // The <01> is glyph id, <0020> is code point. 
+ // The document has three characters <space><nbspace><space>, but the font + // reuses the same glyph for space and nbspace so we should have a single + // CMAP entry for the space, and nbspace will be handled with ActualText + // (tested above). + std::string aCmap("1 beginbfchar\n" + "<01> <0020>\n" + "endbfchar"); + std::string aData(static_cast<const char*>(aObjectStream.GetData()), aObjectStream.GetSize()); + auto nPos = aData.find(aCmap); + CPPUNIT_ASSERT(nPos != std::string::npos); + } + + { + auto aPages = aDocument.GetPages(); + CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size()); + // Get page contents and stream. + auto pContents = aPages[0]->LookupObject("Contents"); + CPPUNIT_ASSERT(pContents); + auto pStream = pContents->GetStream(); + CPPUNIT_ASSERT(pStream); + auto& rObjectStream = pStream->GetMemory(); + + // Uncompress the stream. + SvMemoryStream aUncompressed; + ZCodec aZCodec; + aZCodec.BeginCompression(); + rObjectStream.Seek(0); + aZCodec.Decompress(rObjectStream, aUncompressed); + CPPUNIT_ASSERT(aZCodec.EndCompression()); + + // Make sure the expected ActualText is present. + std::string aData(static_cast<const char*>(aUncompressed.GetData()), aUncompressed.GetSize()); + + std::string aActualText("/Span<</ActualText<"); + size_t nCount = 0; + size_t nPos = 0; + while ((nPos = aData.find(aActualText, nPos)) != std::string::npos) + { + nCount++; + nPos += aActualText.length(); + } + CPPUNIT_ASSERT_EQUAL_MESSAGE("The should be one ActualText entry!", static_cast<size_t>(1), nCount); + + aActualText = "/Span<</ActualText<FEFF00A0>>>"; + nPos = aData.find(aActualText); + CPPUNIT_ASSERT_MESSAGE("ActualText not found!", nPos != std::string::npos); + } +#endif +} + +// This requires Reem Kufi font, if it is missing the test will fail. +void PdfExportTest::testTdf66597_2() +{ + // FIXME: Fallback font is used on Windows for some reason. 
+#if !defined _WIN32 + vcl::filter::PDFDocument aDocument; + load("tdf66597-2.odt", aDocument); + + { + // Get access to ToUnicode of the first font + vcl::filter::PDFObjectElement* pToUnicode = nullptr; + for (const auto& aElement : aDocument.GetElements()) + { + auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get()); + if (!pObject) + continue; + auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type")); + if (pType && pType->GetValue() == "Font") + { + auto pName = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("BaseFont")); + auto aName = pName->GetValue().copy(7); // skip the subset id + CPPUNIT_ASSERT_EQUAL_MESSAGE("Unexpected font name", OString("ReemKufi-Regular"), aName); + + auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode")); + CPPUNIT_ASSERT(pToUnicodeRef); + pToUnicode = pToUnicodeRef->LookupObject(); + break; + } + } + + CPPUNIT_ASSERT(pToUnicode); + auto pStream = pToUnicode->GetStream(); + CPPUNIT_ASSERT(pStream); + SvMemoryStream aObjectStream; + ZCodec aZCodec; + aZCodec.BeginCompression(); + pStream->GetMemory().Seek(0); + aZCodec.Decompress(pStream->GetMemory(), aObjectStream); + CPPUNIT_ASSERT(aZCodec.EndCompression()); + aObjectStream.Seek(0); + std::string aCmap("8 beginbfchar\n" + "<02> <0632>\n" + "<03> <0020>\n" + "<04> <0648>\n" + "<05> <0647>\n" + "<06> <062F>\n" + "<08> <062C>\n" + "<09> <0628>\n" + "<0B> <0623>\n" + "endbfchar"); + std::string aData(static_cast<const char*>(aObjectStream.GetData()), aObjectStream.GetSize()); + auto nPos = aData.find(aCmap); + CPPUNIT_ASSERT(nPos != std::string::npos); + } + + { + auto aPages = aDocument.GetPages(); + CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), aPages.size()); + // Get page contents and stream. 
+ auto pContents = aPages[0]->LookupObject("Contents"); + CPPUNIT_ASSERT(pContents); + auto pStream = pContents->GetStream(); + CPPUNIT_ASSERT(pStream); + auto& rObjectStream = pStream->GetMemory(); + + // Uncompress the stream. + SvMemoryStream aUncompressed; + ZCodec aZCodec; + aZCodec.BeginCompression(); + rObjectStream.Seek(0); + aZCodec.Decompress(rObjectStream, aUncompressed); + CPPUNIT_ASSERT(aZCodec.EndCompression()); + + // Make sure the expected ActualText is present. + std::string aData(static_cast<const char*>(aUncompressed.GetData()), aUncompressed.GetSize()); + + std::vector<std::string> aCodes({ "0632", "062C", "0628", "0623" }); + std::string aActualText("/Span<</ActualText<"); + size_t nCount = 0; + size_t nPos = 0; + while ((nPos = aData.find(aActualText, nPos)) != std::string::npos) + { + nCount++; + nPos += aActualText.length(); + } + CPPUNIT_ASSERT_EQUAL_MESSAGE("Number of ActualText entries does not match!", aCodes.size(), nCount); + + for (const auto& aCode : aCodes) + { + aActualText = "/Span<</ActualText<FEFF" + aCode + ">>>"; + nPos = aData.find(aActualText); + CPPUNIT_ASSERT_MESSAGE("ActualText not found for " + aCode, nPos != std::string::npos); + } + } +#endif +} #endif #endif diff --git a/vcl/source/gdi/CommonSalLayout.cxx b/vcl/source/gdi/CommonSalLayout.cxx index 7404984f4620..c430bda597fb 100644 --- a/vcl/source/gdi/CommonSalLayout.cxx +++ b/vcl/source/gdi/CommonSalLayout.cxx @@ -696,6 +696,7 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs) int32_t nCharPos = pHbGlyphInfos[i].cluster; int32_t nCharCount = 0; bool bInCluster = false; + bool bClusterStart = false; // Find the number of characters that make up this glyph. 
if (!bRightToLeft) @@ -719,6 +720,9 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs) if (nNextCharPos == nCharPos) nNextCharPos = nEndRunPos; nCharCount = nNextCharPos - nCharPos; + if ((i == 0 || pHbGlyphInfos[i].cluster != pHbGlyphInfos[i - 1].cluster) && + (i < nRunGlyphCount - 1 && pHbGlyphInfos[i].cluster == pHbGlyphInfos[i + 1].cluster)) + bClusterStart = true; } } else @@ -742,6 +746,9 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs) if (nNextCharPos == nCharPos) nNextCharPos = nEndRunPos; nCharCount = nNextCharPos - nCharPos; + if ((i == nRunGlyphCount - 1 || pHbGlyphInfos[i].cluster != pHbGlyphInfos[i + 1].cluster) && + (i > 0 && pHbGlyphInfos[i].cluster == pHbGlyphInfos[i - 1].cluster)) + bClusterStart = true; } } @@ -757,6 +764,9 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs) if (bRightToLeft) nGlyphFlags |= GlyphItem::IS_RTL_GLYPH; + if (bClusterStart) + nGlyphFlags |= GlyphItem::IS_CLUSTER_START; + if (bInCluster) nGlyphFlags |= GlyphItem::IS_IN_CLUSTER; diff --git a/vcl/source/gdi/pdfwriter_impl.cxx b/vcl/source/gdi/pdfwriter_impl.cxx index de4872342db7..a61d26235976 100644 --- a/vcl/source/gdi/pdfwriter_impl.cxx +++ b/vcl/source/gdi/pdfwriter_impl.cxx @@ -6431,6 +6431,7 @@ void PDFWriterImpl::drawHorizontalGlyphs( const std::vector<PDFWriterImpl::PDFGlyph>& rGlyphs, OStringBuffer& rLine, const Point& rAlignOffset, + bool bFirst, double fAngle, double fXScale, double fSkew, @@ -6470,7 +6471,7 @@ void PDFWriterImpl::drawHorizontalGlyphs( // the textline matrix relative to what was set before // making use of that would drive us into rounding issues Matrix3 aMat; - if( nRun == 0 && fAngle == 0.0 && fXScale == 1.0 && fSkew == 0.0 ) + if( bFirst && nRun == 0 && fAngle == 0.0 && fXScale == 1.0 && fSkew == 0.0 ) { m_aPages.back().appendPoint( aCurPos, rLine ); rLine.append( " Td " ); @@ -6679,14 +6680,55 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool aCodeUnits.clear(); - // try to handle 
ligatures and such - int nStart = pGlyph->mnCharPos; - int nChars = pGlyph->mnCharCount; - if (nChars < 0) - nChars = 0; + // tdf#66597, tdf#115117 + // + // Here is how we embed textual content in PDF files, to allow for + // better text extraction for complex and typography-rich text. + // + // * If there is many to one or many to many mapping, use an + // ActualText span embedding the original string, since ToUnicode + // can’t handle these. + // * If the one glyph is used for several Unicode code points, also + // use ActualText since ToUnicode can map each glyph in the font + // only once. + // * Limit ActualText to single cluster at a time, since using it + // for whole words or sentences breaks text selection and + // highlighting in PDF viewers (there will be no way to tell + // which glyphs belong to which characters). + // * Keep generating (now) redundant ToUnicode entries for + // compatibility with old tools not supporting ActualText. + + assert(pGlyph->mnCharCount >= 0); + for (int n = 0; n < pGlyph->mnCharCount; n++) + aCodeUnits.push_back(rText[pGlyph->mnCharPos + n]); + + bool bUseActualText = false; + + // If this is a start of complex cluster, use ActualText. + if (pGlyph->IsClusterStart()) + bUseActualText = true; + + // Or part of a complex cluster, will be handled by the ActualText + // of its cluster start. + if (pGlyph->IsInCluster()) + assert(aCodeUnits.empty()); + + // A glyph can’t have more than one ToUnicode entry, use ActualText + // instead. 
+ if (!aCodeUnits.empty() && !bUseActualText) + { + for (const auto& rSubset : m_aSubsets[pFont].m_aSubsets) + { + const auto& it = rSubset.m_aMapping.find(pGlyph->maGlyphId); + if (it != rSubset.m_aMapping.cend() && it->second.codes() != aCodeUnits) + { + bUseActualText = true; + aCodeUnits.clear(); + } + } + } - for (int n = 0; n < nChars; n++) - aCodeUnits.push_back(rText[nStart + n]); + assert(!aCodeUnits.empty() || bUseActualText || pGlyph->IsInCluster()); sal_uInt8 nMappedGlyph; sal_Int32 nMappedFontObject; @@ -6703,12 +6745,19 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool pGraphics); } + int nCharPos = -1; + if (bUseActualText || pGlyph->IsInCluster()) + nCharPos = pGlyph->mnCharPos; + aGlyphs.emplace_back(aPos, nGlyphWidth, pGlyph->maGlyphId, nMappedFontObject, nMappedGlyph, - pGlyph->IsVertical()); + pGlyph->IsVertical(), + pGlyph->IsRTLGlyph(), + nCharPos, + pGlyph->mnCharCount); } // Avoid fill color when map mode is in pixels, the below code assumes @@ -6760,10 +6809,49 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool */ if( ! 
aGlyphs.empty() ) { - if( bVertical ) - drawVerticalGlyphs( aGlyphs, aLine, aAlignOffset, aRotScale, fAngle, fXScale, fSkew, nFontHeight ); - else - drawHorizontalGlyphs( aGlyphs, aLine, aAlignOffset, fAngle, fXScale, fSkew, nFontHeight, nPixelFontHeight ); + size_t nStart = 0; + size_t nEnd = 0; + while (nStart < aGlyphs.size()) + { + while (nEnd < aGlyphs.size() && aGlyphs[nEnd].m_nCharPos == aGlyphs[nStart].m_nCharPos) + nEnd++; + + std::vector<PDFGlyph> aRun(aGlyphs.begin() + nStart, aGlyphs.begin() + nEnd); + + int nCharPos, nCharCount; + if (!aRun.front().m_bRTL) + { + nCharPos = aRun.front().m_nCharPos; + nCharCount = aRun.front().m_nCharCount; + } + else + { + nCharPos = aRun.back().m_nCharPos; + nCharCount = aRun.back().m_nCharCount; + } + + if (nCharPos >= 0 && nCharCount) + { + aLine.append("/Span<</ActualText<FEFF"); + for (int i = 0; i < nCharCount; i++) + { + sal_Unicode aChar = rText[nCharPos + i]; + appendHex(static_cast<sal_Int8>(aChar >> 8), aLine); + appendHex(static_cast<sal_Int8>(aChar & 255), aLine); + } + aLine.append( ">>>\nBDC\n" ); + } + + if (bVertical) + drawVerticalGlyphs(aRun, aLine, aAlignOffset, aRotScale, fAngle, fXScale, fSkew, nFontHeight); + else + drawHorizontalGlyphs(aRun, aLine, aAlignOffset, nStart == 0, fAngle, fXScale, fSkew, nFontHeight, nPixelFontHeight); + + if (nCharPos >= 0 && nCharCount) + aLine.append( "EMC\n" ); + + nStart = nEnd; + } } // end textobject diff --git a/vcl/source/gdi/pdfwriter_impl.hxx b/vcl/source/gdi/pdfwriter_impl.hxx index 29e41d0f23e1..6635e083556f 100644 --- a/vcl/source/gdi/pdfwriter_impl.hxx +++ b/vcl/source/gdi/pdfwriter_impl.hxx @@ -328,6 +328,7 @@ public: m_CodeUnits.push_back(i_cCode); } sal_Int32 countCodes() const { return m_CodeUnits.size(); } + const std::vector<sal_Ucs>& codes() const { return m_CodeUnits; } sal_Ucs getCode( sal_Int32 i_nIndex ) const { sal_Ucs nRet = 0; @@ -591,16 +592,23 @@ public: sal_Int32 m_nMappedFontId; sal_uInt8 m_nMappedGlyphId; bool m_bVertical; + bool 
m_bRTL; + int m_nCharPos; + int m_nCharCount; PDFGlyph( const Point& rPos, sal_Int32 nNativeWidth, sal_Int32 nGlyphId, sal_Int32 nFontId, sal_uInt8 nMappedGlyphId, - bool bVertical ) + bool bVertical, + bool bRTL, + int nCharPos, + int nCharCount ) : m_aPos( rPos ), m_nNativeWidth( nNativeWidth ), m_nGlyphId( nGlyphId ), m_nMappedFontId( nFontId ), m_nMappedGlyphId( nMappedGlyphId ), - m_bVertical(bVertical) + m_bVertical(bVertical), m_bRTL(bRTL), + m_nCharPos(nCharPos), m_nCharCount(nCharCount) {} }; @@ -815,7 +823,7 @@ i12626 /* emits a text object according to the passed layout */ /* TODO: remove rText as soon as SalLayout will change so that rText is not necessary anymore */ void drawVerticalGlyphs( const std::vector<PDFGlyph>& rGlyphs, OStringBuffer& rLine, const Point& rAlignOffset, const Matrix3& rRotScale, double fAngle, double fXScale, double fSkew, sal_Int32 nFontHeight ); - void drawHorizontalGlyphs( const std::vector<PDFGlyph>& rGlyphs, OStringBuffer& rLine, const Point& rAlignOffset, double fAngle, double fXScale, double fSkew, sal_Int32 nFontHeight, sal_Int32 nPixelFontHeight ); + void drawHorizontalGlyphs( const std::vector<PDFGlyph>& rGlyphs, OStringBuffer& rLine, const Point& rAlignOffset, bool bFirst, double fAngle, double fXScale, double fSkew, sal_Int32 nFontHeight, sal_Int32 nPixelFontHeight ); void drawLayout( SalLayout& rLayout, const OUString& rText, bool bTextLines ); void drawRelief( SalLayout& rLayout, const OUString& rText, bool bTextLines ); void drawShadow( SalLayout& rLayout, const OUString& rText, bool bTextLines ); |