tdf#115117: Fix PDF ToUnicode CMAP for ligatures

Move the glyph to character(s) mapping to CommonSalLayout where we have enough information to do this properly. This correctly handles ligatures at end of run that wasn’t handled before, and also fixes a bug in the PDF writer code when there is more than one ligature in the run (it forgot to clear aCodeUnitsPerGlyph vector after each iteration). Also drop the “temporary” fix for rotated glyph from 2009 that does not seem to be needed now (the document from that bug exports correctly after this change). Change-Id: I5b5b1f4470bbd0ef05cbbc86dfa29d2ff51249ea Reviewed-on: https://gerrit.libreoffice.org/51617 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk> (cherry picked from commit b94a66ebc8db6c5ca9c7dcfdfbb06b49deae4939) Reviewed-on: https://gerrit.libreoffice.org/51715
author: Khaled Hosny <khaledhosny@eglug.org> 2018-03-21 16:54:10 +0200
committer: Miklos Vajna <vmiklos@collabora.co.uk> 2018-03-23 09:25:42 +0100
commit: 90fb652ebbc4b16ae5001140076f52209e913345 (patch)
tree: 35238e8dfbd29c6535867acf944fe70822e3e437
parent: deb5db7bf6d45338c9b6f6f4a1d62fba168b7a85 (diff)
6 files changed, 277 insertions, 38 deletions
diff --git a/vcl/inc/sallayout.hxx b/vcl/inc/sallayout.hxx
index ff008c44dd55..c4a29dc9d0c9 100644
--- a/vcl/inc/sallayout.hxx
+++ b/vcl/inc/sallayout.hxx
@@ -255,6 +255,7 @@ struct GlyphItem
 {
     int     mnFlags;
     int     mnCharPos;      // index in string
+    int     mnCharCount;    // number of characters making up this glyph
 
     int     mnOrigWidth;    // original glyph width
     int     mnNewWidth;     // width after adjustments
@@ -270,6 +271,7 @@ public:
                 long nFlags, int nOrigWidth )
             :   mnFlags(nFlags)
             ,   mnCharPos(nCharPos)
+            ,   mnCharCount(1)
             ,   mnOrigWidth(nOrigWidth)
             ,   mnNewWidth(nOrigWidth)
             ,   mnXOffset(0)
@@ -278,10 +280,11 @@ public:
             ,   mnFallbackLevel(0)
             { }
 
-            GlyphItem( int nCharPos, sal_GlyphId aGlyphId, const Point& rLinearPos,
+            GlyphItem(int nCharPos, int nCharCount, sal_GlyphId aGlyphId, const Point& rLinearPos,
                 long nFlags, int nOrigWidth, int nXOffset )
             :   mnFlags(nFlags)
             ,   mnCharPos(nCharPos)
+            ,   mnCharCount(nCharCount)
             ,   mnOrigWidth(nOrigWidth)
             ,   mnNewWidth(nOrigWidth)
             ,   mnXOffset(nXOffset)
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt b/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt
new file mode 100644
index 000000000000..63fe82946ef1
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt b/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt
new file mode 100644
index 000000000000..c1e1f6d4392c
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt
diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
index a904a5dc638d..ba8df2f1a616 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
@@ -8,6 +8,7 @@
  */
 
 #include <config_features.h>
+#include <config_test.h>
 
 #include <com/sun/star/frame/Desktop.hpp>
 #include <com/sun/star/frame/XStorable.hpp>
@@ -25,6 +26,7 @@
 #include <tools/zcodec.hxx>
 #if HAVE_FEATURE_PDFIUM
 #include <fpdf_edit.h>
+#include <fpdf_text.h>
 #include <fpdfview.h>
 #endif
 
@@ -67,6 +69,16 @@ public:
     void testTdf99680();
     void testTdf99680_2();
     void testTdf108963();
+#if !TEST_FONTS_MISSING
+    /// Test writing ToUnicode CMAP for LTR ligatures.
+    void testTdf115117_1();
+    /// Text extracting LTR text with ligatures.
+    void testTdf115117_1a();
+    /// Test writing ToUnicode CMAP for RTL ligatures.
+    void testTdf115117_2();
+    /// Text extracting RTL text with ligatures.
+    void testTdf115117_2a();
+#endif
 #endif
 
     CPPUNIT_TEST_SUITE(PdfExportTest);
@@ -85,6 +97,12 @@ public:
     CPPUNIT_TEST(testTdf99680);
     CPPUNIT_TEST(testTdf99680_2);
     CPPUNIT_TEST(testTdf108963);
+#if !TEST_FONTS_MISSING
+    CPPUNIT_TEST(testTdf115117_1);
+    CPPUNIT_TEST(testTdf115117_1a);
+    CPPUNIT_TEST(testTdf115117_2);
+    CPPUNIT_TEST(testTdf115117_2a);
+#endif
 #endif
     CPPUNIT_TEST_SUITE_END();
 };
@@ -760,6 +778,205 @@ void PdfExportTest::testTdf108963()
 
     CPPUNIT_ASSERT_EQUAL(1, nYellowPathCount);
 }
+
+#if !TEST_FONTS_MISSING
+// This requires Carlito font, if it is missing the test will most likely
+// fail.
+void PdfExportTest::testTdf115117_1()
+{
+    vcl::filter::PDFDocument aDocument;
+    load("tdf115117-1.odt", aDocument);
+
+    vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+
+    // Get access to ToUnicode of the first font
+    for (const auto& aElement : aDocument.GetElements())
+    {
+        auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+        if (!pObject)
+            continue;
+        auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+        if (pType && pType->GetValue() == "Font")
+        {
+            auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+            CPPUNIT_ASSERT(pToUnicodeRef);
+            pToUnicode = pToUnicodeRef->LookupObject();
+            break;
+        }
+    }
+
+    CPPUNIT_ASSERT(pToUnicode);
+    auto pStream = pToUnicode->GetStream();
+    CPPUNIT_ASSERT(pStream);
+    SvMemoryStream aObjectStream;
+    ZCodec aZCodec;
+    aZCodec.BeginCompression();
+    pStream->GetMemory().Seek(0);
+    aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+    CPPUNIT_ASSERT(aZCodec.EndCompression());
+    aObjectStream.Seek(0);
+    // The first values, <01> <02> etc., are glyph ids, they might change order
+    // if we changed how font subsets are created.
+    // The second values, <00740069> etc., are Unicode code points in hex,
+    // <00740069> is U+0074 and U+0069 i.e. "ti" which is a ligature in
+    // Carlito/Callibri. This test is failing if any of the second values
+    // changed which means we are not detecting ligatures and writing CMAP
+    // entries for them correctly. If glyph order in the subset changes then
+    // the order here will changes and the PDF has to be carefully inspected to
+    // ensure that the new values are correct before updating the string below.
+    OString aCmap("9 beginbfchar\n"
+                  "<01> <00740069>\n"
+                  "<02> <0020>\n"
+                  "<03> <0074>\n"
+                  "<04> <0065>\n"
+                  "<05> <0073>\n"
+                  "<06> <00660069>\n"
+                  "<07> <0066006C>\n"
+                  "<08> <006600660069>\n"
+                  "<09> <00660066006C>\n"
+                  "endbfchar");
+    auto pStart = static_cast<const char*>(aObjectStream.GetData());
+    const char* pEnd = pStart + aObjectStream.GetSize();
+    auto it = std::search(pStart, pEnd, aCmap.getStr(), aCmap.getStr() + aCmap.getLength());
+    CPPUNIT_ASSERT(it != pEnd);
+}
+
+// This requires DejaVu Sans font, if it is missing the test will most likely
+// fail.
+void PdfExportTest::testTdf115117_2()
+{
+    // See the comments in testTdf115117_1() for explanation.
+
+    vcl::filter::PDFDocument aDocument;
+    load("tdf115117-2.odt", aDocument);
+
+    vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+
+    for (const auto& aElement : aDocument.GetElements())
+    {
+        auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+        if (!pObject)
+            continue;
+        auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+        if (pType && pType->GetValue() == "Font")
+        {
+            auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+            CPPUNIT_ASSERT(pToUnicodeRef);
+            pToUnicode = pToUnicodeRef->LookupObject();
+            break;
+        }
+    }
+
+    CPPUNIT_ASSERT(pToUnicode);
+    auto pStream = pToUnicode->GetStream();
+    CPPUNIT_ASSERT(pStream);
+    SvMemoryStream aObjectStream;
+    ZCodec aZCodec;
+    aZCodec.BeginCompression();
+    pStream->GetMemory().Seek(0);
+    aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+    CPPUNIT_ASSERT(aZCodec.EndCompression());
+    aObjectStream.Seek(0);
+    OString aCmap("7 beginbfchar\n"
+                  "<01> <06440627>\n"
+                  "<02> <0020>\n"
+                  "<03> <0641>\n"
+                  "<04> <0642>\n"
+                  "<05> <0648>\n"
+                  "<06> <06440627>\n"
+                  "<07> <0628>\n"
+                  "endbfchar");
+    auto pStart = static_cast<const char*>(aObjectStream.GetData());
+    const char* pEnd = pStart + aObjectStream.GetSize();
+    auto it = std::search(pStart, pEnd, aCmap.getStr(), aCmap.getStr() + aCmap.getLength());
+    CPPUNIT_ASSERT(it != pEnd);
+}
+
+void PdfExportTest::testTdf115117_1a()
+{
+    // Import the bugdoc and export as PDF.
+    OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf115117-1.odt";
+    mxComponent = loadFromDesktop(aURL);
+    CPPUNIT_ASSERT(mxComponent.is());
+
+    uno::Reference<frame::XStorable> xStorable(mxComponent, uno::UNO_QUERY);
+    utl::TempFile aTempFile;
+    aTempFile.EnableKillingFile();
+    utl::MediaDescriptor aMediaDescriptor;
+    aMediaDescriptor["FilterName"] <<= OUString("writer_pdf_Export");
+    xStorable->storeToURL(aTempFile.GetURL(), aMediaDescriptor.getAsConstPropertyValueList());
+
+    // Parse the export result with pdfium.
+    SvFileStream aFile(aTempFile.GetURL(), StreamMode::READ);
+    SvMemoryStream aMemory;
+    aMemory.WriteStream(aFile);
+    mpPdfDocument = FPDF_LoadMemDocument(aMemory.GetData(), aMemory.GetSize(), /*password=*/nullptr);
+    CPPUNIT_ASSERT(mpPdfDocument);
+
+    // The document has one page.
+    CPPUNIT_ASSERT_EQUAL(1, FPDF_GetPageCount(mpPdfDocument));
+    mpPdfPage = FPDF_LoadPage(mpPdfDocument, /*page_index=*/0);
+    CPPUNIT_ASSERT(mpPdfPage);
+
+    auto pPdfTextPage = FPDFText_LoadPage(mpPdfPage);
+    CPPUNIT_ASSERT(pPdfTextPage);
+
+    // Extract the text from the page. This pdfium API is a bit higher level
+    // than we want and might apply heuristic that give false positive, but it
+    // is a good approximation in addition to the check in testTdf115117_1().
+    int nChars = FPDFText_CountChars(pPdfTextPage);
+    CPPUNIT_ASSERT_EQUAL(44, nChars);
+
+    OUString aExpectedText = "ti ti test ti\r\nti test fi fl ffi ffl test fi";
+    std::vector<sal_uInt32> aChars(nChars);
+    for (int i = 0; i < nChars; i++)
+        aChars[i] = FPDFText_GetUnicode(pPdfTextPage, i);
+    OUString aActualText(aChars.data(), aChars.size());
+    CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText);
+}
+
+void PdfExportTest::testTdf115117_2a()
+{
+    // See the comments in testTdf115117_1a() for explanation.
+
+    // Import the bugdoc and export as PDF.
+    OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf115117-2.odt";
+    mxComponent = loadFromDesktop(aURL);
+    CPPUNIT_ASSERT(mxComponent.is());
+
+    uno::Reference<frame::XStorable> xStorable(mxComponent, uno::UNO_QUERY);
+    utl::TempFile aTempFile;
+    aTempFile.EnableKillingFile();
+    utl::MediaDescriptor aMediaDescriptor;
+    aMediaDescriptor["FilterName"] <<= OUString("writer_pdf_Export");
+    xStorable->storeToURL(aTempFile.GetURL(), aMediaDescriptor.getAsConstPropertyValueList());
+
+    // Parse the export result with pdfium.
+    SvFileStream aFile(aTempFile.GetURL(), StreamMode::READ);
+    SvMemoryStream aMemory;
+    aMemory.WriteStream(aFile);
+    mpPdfDocument = FPDF_LoadMemDocument(aMemory.GetData(), aMemory.GetSize(), /*password=*/nullptr);
+    CPPUNIT_ASSERT(mpPdfDocument);
+
+    // The document has one page.
+    CPPUNIT_ASSERT_EQUAL(1, FPDF_GetPageCount(mpPdfDocument));
+    mpPdfPage = FPDF_LoadPage(mpPdfDocument, /*page_index=*/0);
+    CPPUNIT_ASSERT(mpPdfPage);
+
+    auto pPdfTextPage = FPDFText_LoadPage(mpPdfPage);
+    CPPUNIT_ASSERT(pPdfTextPage);
+
+    int nChars = FPDFText_CountChars(pPdfTextPage);
+    CPPUNIT_ASSERT_EQUAL(13, nChars);
+
+    OUString aExpectedText = u"\u0627\u0644 \u0628\u0627\u0644 \u0648\u0642\u0641 \u0627\u0644";
+    std::vector<sal_uInt32> aChars(nChars);
+    for (int i = 0; i < nChars; i++)
+        aChars[i] = FPDFText_GetUnicode(pPdfTextPage, i);
+    OUString aActualText(aChars.data(), aChars.size());
+    CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText);
+}
+#endif
 #endif
 
 CPPUNIT_TEST_SUITE_REGISTRATION(PdfExportTest);
diff --git a/vcl/source/gdi/CommonSalLayout.cxx b/vcl/source/gdi/CommonSalLayout.cxx
index 2f110b138a8a..cfd86ed27409 100644
--- a/vcl/source/gdi/CommonSalLayout.cxx
+++ b/vcl/source/gdi/CommonSalLayout.cxx
@@ -690,6 +690,49 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs)
             for (int i = 0; i < nRunGlyphCount; ++i) {
                 int32_t nGlyphIndex = pHbGlyphInfos[i].codepoint;
                 int32_t nCharPos = pHbGlyphInfos[i].cluster;
+                int32_t nCharCount = 0;
+
+                // Find the number of characters that make up this glyph.
+                if (!bRightToLeft)
+                {
+                    // If the cluster is the same as previous glyph, then this
+                    // already consumed, skip.
+                    if (i > 0 && pHbGlyphInfos[i].cluster == pHbGlyphInfos[i - 1].cluster)
+                        nCharCount = 0;
+                    else
+                    {
+                        // Find the next glyph with a different cluster, or the
+                        // end of text.
+                        int j = i;
+                        int32_t nNextCharPos = nCharPos;
+                        while (nNextCharPos == nCharPos && j < nRunGlyphCount)
+                            nNextCharPos = pHbGlyphInfos[j++].cluster;
+
+                        if (nNextCharPos == nCharPos)
+                            nNextCharPos = rArgs.mnEndCharPos;
+                        nCharCount = nNextCharPos - nCharPos;
+                    }
+                }
+                else
+                {
+                    // If the cluster is the same as previous glyph, then this
+                    // will be consumed later, skip.
+                    if (i < nRunGlyphCount - 1 && pHbGlyphInfos[i].cluster == pHbGlyphInfos[i + 1].cluster)
+                        nCharCount = 0;
+                    else
+                    {
+                        // Find the previous glyph with a different cluster, or
+                        // the end of text.
+                        int j = i;
+                        int32_t nNextCharPos = nCharPos;
+                        while (nNextCharPos == nCharPos && j >= 0)
+                            nNextCharPos = pHbGlyphInfos[j--].cluster;
+
+                        if (nNextCharPos == nCharPos)
+                            nNextCharPos = rArgs.mnEndCharPos;
+                        nCharCount = nNextCharPos - nCharPos;
+                    }
+                }
 
                 // if needed request glyph fallback by updating LayoutArgs
                 if (!nGlyphIndex)
@@ -756,7 +799,7 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs)
                 nYOffset = std::lround(nYOffset * nYScale);
 
                 Point aNewPos(aCurrPos.X() + nXOffset, aCurrPos.Y() + nYOffset);
-                const GlyphItem aGI(nCharPos, nGlyphIndex, aNewPos, nGlyphFlags,
+                const GlyphItem aGI(nCharPos, nCharCount, nGlyphIndex, aNewPos, nGlyphFlags,
                                     nAdvance, nXOffset);
                 AppendGlyph(aGI);
 
diff --git a/vcl/source/gdi/pdfwriter_impl.cxx b/vcl/source/gdi/pdfwriter_impl.cxx
index 03b1a1d9e12d..58711a9d862b 100644
--- a/vcl/source/gdi/pdfwriter_impl.cxx
+++ b/vcl/source/gdi/pdfwriter_impl.cxx
@@ -6598,7 +6598,6 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool
     bool bVertical = m_aCurrentPDFState.m_aFont.IsVertical();
     int nGlyphs;
     int nIndex = 0;
-    int nMaxCharPos = rText.getLength()-1;
     double fXScale = 1.0;
     double fSkew = 0.0;
     sal_Int32 nPixelFontHeight = m_pReferenceDevice->mpFontInstance->maFontSelData.mnHeight;
@@ -6717,48 +6716,25 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool
     FontMetric aRefDevFontMetric = m_pReferenceDevice->GetFontMetric();
 
     // collect the glyphs into a single array
-    const int nTmpMaxGlyphs = rLayout.GetOrientation() ? 1 : nMaxGlyphs; // #i97991# temporary workaround for #i87686#
     std::vector< PDFGlyph > aGlyphs;
-    aGlyphs.reserve( nTmpMaxGlyphs );
+    aGlyphs.reserve( nMaxGlyphs );
     // first get all the glyphs and register them; coordinates still in Pixel
     Point aGNGlyphPos;
-    while ((nGlyphs = rLayout.GetNextGlyphs(nTmpMaxGlyphs, pGlyphs, aGNGlyphPos, nIndex, pFallbackFonts)) != 0)
+    while ((nGlyphs = rLayout.GetNextGlyphs(nMaxGlyphs, pGlyphs, aGNGlyphPos, nIndex, pFallbackFonts)) != 0)
     {
         aCodeUnits.clear();
+        aCodeUnitsPerGlyph.clear();
         for( int i = 0; i < nGlyphs; i++ )
         {
-            // default case: 1 glyph is one unicode
-            aCodeUnitsPerGlyph.push_back(1);
-            if (pGlyphs[i]->mnCharPos >= 0 && pGlyphs[i]->mnCharPos <= nMaxCharPos)
-            {
-                int nChars = 1;
-                // try to handle ligatures and such
-                if( i < nGlyphs-1 )
-                {
-                    nChars = pGlyphs[i+1]->mnCharPos - pGlyphs[i]->mnCharPos;
-                    int start = pGlyphs[i]->mnCharPos;
-                    // #i115618# fix for simple RTL+CTL cases
-                    // supports RTL ligatures. TODO: more complex CTL, etc.
-                    if( nChars < 0 )
-                    {
-                        nChars = -nChars;
-                        start = pGlyphs[i+1]->mnCharPos + 1;
-                    }
-                    else if (nChars == 0)
-                        nChars = 1;
-                    aCodeUnitsPerGlyph.back() = nChars;
-                    for( int n = 0; n < nChars; n++ )
-                        aCodeUnits.push_back( rText[ start + n ] );
-                }
-                else
-                    aCodeUnits.push_back(rText[pGlyphs[i]->mnCharPos]);
-            }
-            else
-                aCodeUnits.push_back( 0 );
-            // note: in case of ctl one character may result
-            // in multiple glyphs. The current SalLayout
-            // implementations set -1 then to indicate that no direct
-            // mapping is possible
+            // try to handle ligatures and such
+            int nStart = pGlyphs[i]->mnCharPos;
+            int nChars = pGlyphs[i]->mnCharCount;
+            if (nChars < 0)
+                nChars = 0;
+
+            aCodeUnitsPerGlyph.push_back(nChars);
+            for( int n = 0; n < nChars; n++ )
+                aCodeUnits.push_back( rText[ nStart + n ] );
         }
 
         registerGlyphs( nGlyphs, pGlyphs, pGlyphWidths, aCodeUnits.data(), aCodeUnitsPerGlyph.data(), pMappedGlyphs, pMappedFontObjects, pFallbackFonts );
author	Khaled Hosny <khaledhosny@eglug.org>	2018-03-21 16:54:10 +0200
committer	Miklos Vajna <vmiklos@collabora.co.uk>	2018-03-23 09:25:42 +0100
commit	90fb652ebbc4b16ae5001140076f52209e913345 (patch)
tree	35238e8dfbd29c6535867acf944fe70822e3e437
parent	deb5db7bf6d45338c9b6f6f4a1d62fba168b7a85 (diff)