6 files changed, 277 insertions, 38 deletions
diff --git a/vcl/inc/sallayout.hxx b/vcl/inc/sallayout.hxx
index ff008c44dd55..c4a29dc9d0c9 100644
--- a/vcl/inc/sallayout.hxx
+++ b/vcl/inc/sallayout.hxx
@@ -255,6 +255,7 @@ struct GlyphItem
 {
     int     mnFlags;
     int     mnCharPos;      // index in string
+    int     mnCharCount;    // number of characters making up this glyph
 
     int     mnOrigWidth;    // original glyph width
     int     mnNewWidth;     // width after adjustments
@@ -270,6 +271,7 @@ public:
                 long nFlags, int nOrigWidth )
             :   mnFlags(nFlags)
             ,   mnCharPos(nCharPos)
+            ,   mnCharCount(1)
             ,   mnOrigWidth(nOrigWidth)
             ,   mnNewWidth(nOrigWidth)
             ,   mnXOffset(0)
@@ -278,10 +280,11 @@ public:
             ,   mnFallbackLevel(0)
             { }
 
-            GlyphItem( int nCharPos, sal_GlyphId aGlyphId, const Point& rLinearPos,
+            GlyphItem(int nCharPos, int nCharCount, sal_GlyphId aGlyphId, const Point& rLinearPos,
                 long nFlags, int nOrigWidth, int nXOffset )
             :   mnFlags(nFlags)
             ,   mnCharPos(nCharPos)
+            ,   mnCharCount(nCharCount)
             ,   mnOrigWidth(nOrigWidth)
             ,   mnNewWidth(nOrigWidth)
             ,   mnXOffset(nXOffset)
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt b/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt
new file mode 100644
index 000000000000..63fe82946ef1
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf115117-1.odt
diff --git a/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt b/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt
new file mode 100644
index 000000000000..c1e1f6d4392c
--- /dev/null
+++ b/vcl/qa/cppunit/pdfexport/data/tdf115117-2.odt
diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
index a904a5dc638d..ba8df2f1a616 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
@@ -8,6 +8,7 @@
  */
 
 #include <config_features.h>
+#include <config_test.h>
 
 #include <com/sun/star/frame/Desktop.hpp>
 #include <com/sun/star/frame/XStorable.hpp>
@@ -25,6 +26,7 @@
 #include <tools/zcodec.hxx>
 #if HAVE_FEATURE_PDFIUM
 #include <fpdf_edit.h>
+#include <fpdf_text.h>
 #include <fpdfview.h>
 #endif
 
@@ -67,6 +69,16 @@ public:
     void testTdf99680();
     void testTdf99680_2();
     void testTdf108963();
+#if !TEST_FONTS_MISSING
+    /// Test writing ToUnicode CMAP for LTR ligatures.
+    void testTdf115117_1();
+    /// Text extracting LTR text with ligatures.
+    void testTdf115117_1a();
+    /// Test writing ToUnicode CMAP for RTL ligatures.
+    void testTdf115117_2();
+    /// Text extracting RTL text with ligatures.
+    void testTdf115117_2a();
+#endif
 #endif
 
     CPPUNIT_TEST_SUITE(PdfExportTest);
@@ -85,6 +97,12 @@ public:
     CPPUNIT_TEST(testTdf99680);
     CPPUNIT_TEST(testTdf99680_2);
     CPPUNIT_TEST(testTdf108963);
+#if !TEST_FONTS_MISSING
+    CPPUNIT_TEST(testTdf115117_1);
+    CPPUNIT_TEST(testTdf115117_1a);
+    CPPUNIT_TEST(testTdf115117_2);
+    CPPUNIT_TEST(testTdf115117_2a);
+#endif
 #endif
     CPPUNIT_TEST_SUITE_END();
 };
@@ -760,6 +778,205 @@ void PdfExportTest::testTdf108963()
 
     CPPUNIT_ASSERT_EQUAL(1, nYellowPathCount);
 }
+
+#if !TEST_FONTS_MISSING
+// This requires Carlito font, if it is missing the test will most likely
+// fail.
+void PdfExportTest::testTdf115117_1()
+{
+    vcl::filter::PDFDocument aDocument;
+    load("tdf115117-1.odt", aDocument);
+
+    vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+
+    // Get access to ToUnicode of the first font
+    for (const auto& aElement : aDocument.GetElements())
+    {
+        auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+        if (!pObject)
+            continue;
+        auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+        if (pType && pType->GetValue() == "Font")
+        {
+            auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+            CPPUNIT_ASSERT(pToUnicodeRef);
+            pToUnicode = pToUnicodeRef->LookupObject();
+            break;
+        }
+    }
+
+    CPPUNIT_ASSERT(pToUnicode);
+    auto pStream = pToUnicode->GetStream();
+    CPPUNIT_ASSERT(pStream);
+    SvMemoryStream aObjectStream;
+    ZCodec aZCodec;
+    aZCodec.BeginCompression();
+    pStream->GetMemory().Seek(0);
+    aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+    CPPUNIT_ASSERT(aZCodec.EndCompression());
+    aObjectStream.Seek(0);
+    // The first values, <01> <02> etc., are glyph ids, they might change order
+    // if we changed how font subsets are created.
+    // The second values, <00740069> etc., are Unicode code points in hex,
+    // <00740069> is U+0074 and U+0069 i.e. "ti" which is a ligature in
+    // Carlito/Callibri. This test is failing if any of the second values
+    // changed which means we are not detecting ligatures and writing CMAP
+    // entries for them correctly. If glyph order in the subset changes then
+    // the order here will changes and the PDF has to be carefully inspected to
+    // ensure that the new values are correct before updating the string below.
+    OString aCmap("9 beginbfchar\n"
+                  "<01> <00740069>\n"
+                  "<02> <0020>\n"
+                  "<03> <0074>\n"
+                  "<04> <0065>\n"
+                  "<05> <0073>\n"
+                  "<06> <00660069>\n"
+                  "<07> <0066006C>\n"
+                  "<08> <006600660069>\n"
+                  "<09> <00660066006C>\n"
+                  "endbfchar");
+    auto pStart = static_cast<const char*>(aObjectStream.GetData());
+    const char* pEnd = pStart + aObjectStream.GetSize();
+    auto it = std::search(pStart, pEnd, aCmap.getStr(), aCmap.getStr() + aCmap.getLength());
+    CPPUNIT_ASSERT(it != pEnd);
+}
+
+// This requires DejaVu Sans font, if it is missing the test will most likely
+// fail.
+void PdfExportTest::testTdf115117_2()
+{
+    // See the comments in testTdf115117_1() for explanation.
+
+    vcl::filter::PDFDocument aDocument;
+    load("tdf115117-2.odt", aDocument);
+
+    vcl::filter::PDFObjectElement* pToUnicode = nullptr;
+
+    for (const auto& aElement : aDocument.GetElements())
+    {
+        auto pObject = dynamic_cast<vcl::filter::PDFObjectElement*>(aElement.get());
+        if (!pObject)
+            continue;
+        auto pType = dynamic_cast<vcl::filter::PDFNameElement*>(pObject->Lookup("Type"));
+        if (pType && pType->GetValue() == "Font")
+        {
+            auto pToUnicodeRef = dynamic_cast<vcl::filter::PDFReferenceElement*>(pObject->Lookup("ToUnicode"));
+            CPPUNIT_ASSERT(pToUnicodeRef);
+            pToUnicode = pToUnicodeRef->LookupObject();
+            break;
+        }
+    }
+
+    CPPUNIT_ASSERT(pToUnicode);
+    auto pStream = pToUnicode->GetStream();
+    CPPUNIT_ASSERT(pStream);
+    SvMemoryStream aObjectStream;
+    ZCodec aZCodec;
+    aZCodec.BeginCompression();
+    pStream->GetMemory().Seek(0);
+    aZCodec.Decompress(pStream->GetMemory(), aObjectStream);
+    CPPUNIT_ASSERT(aZCodec.EndCompression());
+    aObjectStream.Seek(0);
+    OString aCmap("7 beginbfchar\n"
+                  "<01> <06440627>\n"
+                  "<02> <0020>\n"
+                  "<03> <0641>\n"
+                  "<04> <0642>\n"
+                  "<05> <0648>\n"
+                  "<06> <06440627>\n"
+                  "<07> <0628>\n"
+                  "endbfchar");
+    auto pStart = static_cast<const char*>(aObjectStream.GetData());
+    const char* pEnd = pStart + aObjectStream.GetSize();
+    auto it = std::search(pStart, pEnd, aCmap.getStr(), aCmap.getStr() + aCmap.getLength());
+    CPPUNIT_ASSERT(it != pEnd);
+}
+
+void PdfExportTest::testTdf115117_1a()
+{
+    // Import the bugdoc and export as PDF.
+    OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf115117-1.odt";
+    mxComponent = loadFromDesktop(aURL);
+    CPPUNIT_ASSERT(mxComponent.is());
+
+    uno::Reference<frame::XStorable> xStorable(mxComponent, uno::UNO_QUERY);
+    utl::TempFile aTempFile;
+    aTempFile.EnableKillingFile();
+    utl::MediaDescriptor aMediaDescriptor;
+    aMediaDescriptor["FilterName"] <<= OUString("writer_pdf_Export");
+    xStorable->storeToURL(aTempFile.GetURL(), aMediaDescriptor.getAsConstPropertyValueList());
+
+    // Parse the export result with pdfium.
+    SvFileStream aFile(aTempFile.GetURL(), StreamMode::READ);
+    SvMemoryStream aMemory;
+    aMemory.WriteStream(aFile);
+    mpPdfDocument = FPDF_LoadMemDocument(aMemory.GetData(), aMemory.GetSize(), /*password=*/nullptr);
+    CPPUNIT_ASSERT(mpPdfDocument);
+
+    // The document has one page.
+    CPPUNIT_ASSERT_EQUAL(1, FPDF_GetPageCount(mpPdfDocument));
+    mpPdfPage = FPDF_LoadPage(mpPdfDocument, /*page_index=*/0);
+    CPPUNIT_ASSERT(mpPdfPage);
+
+    auto pPdfTextPage = FPDFText_LoadPage(mpPdfPage);
+    CPPUNIT_ASSERT(pPdfTextPage);
+
+    // Extract the text from the page. This pdfium API is a bit higher level
+    // than we want and might apply heuristic that give false positive, but it
+    // is a good approximation in addition to the check in testTdf115117_1().
+    int nChars = FPDFText_CountChars(pPdfTextPage);
+    CPPUNIT_ASSERT_EQUAL(44, nChars);
+
+    OUString aExpectedText = "ti ti test ti\r\nti test fi fl ffi ffl test fi";
+    std::vector<sal_uInt32> aChars(nChars);
+    for (int i = 0; i < nChars; i++)
+        aChars[i] = FPDFText_GetUnicode(pPdfTextPage, i);
+    OUString aActualText(aChars.data(), aChars.size());
+    CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText);
+}
+
+void PdfExportTest::testTdf115117_2a()
+{
+    // See the comments in testTdf115117_1a() for explanation.
+
+    // Import the bugdoc and export as PDF.
+    OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf115117-2.odt";
+    mxComponent = loadFromDesktop(aURL);
+    CPPUNIT_ASSERT(mxComponent.is());
+
+    uno::Reference<frame::XStorable> xStorable(mxComponent, uno::UNO_QUERY);
+    utl::TempFile aTempFile;
+    aTempFile.EnableKillingFile();
+    utl::MediaDescriptor aMediaDescriptor;
+    aMediaDescriptor["FilterName"] <<= OUString("writer_pdf_Export");
+    xStorable->storeToURL(aTempFile.GetURL(), aMediaDescriptor.getAsConstPropertyValueList());
+
+    // Parse the export result with pdfium.
+    SvFileStream aFile(aTempFile.GetURL(), StreamMode::READ);
+    SvMemoryStream aMemory;
+    aMemory.WriteStream(aFile);
+    mpPdfDocument = FPDF_LoadMemDocument(aMemory.GetData(), aMemory.GetSize(), /*password=*/nullptr);
+    CPPUNIT_ASSERT(mpPdfDocument);
+
+    // The document has one page.
+    CPPUNIT_ASSERT_EQUAL(1, FPDF_GetPageCount(mpPdfDocument));
+    mpPdfPage = FPDF_LoadPage(mpPdfDocument, /*page_index=*/0);
+    CPPUNIT_ASSERT(mpPdfPage);
+
+    auto pPdfTextPage = FPDFText_LoadPage(mpPdfPage);
+    CPPUNIT_ASSERT(pPdfTextPage);
+
+    int nChars = FPDFText_CountChars(pPdfTextPage);
+    CPPUNIT_ASSERT_EQUAL(13, nChars);
+
+    OUString aExpectedText = u"\u0627\u0644 \u0628\u0627\u0644 \u0648\u0642\u0641 \u0627\u0644";
+    std::vector<sal_uInt32> aChars(nChars);
+    for (int i = 0; i < nChars; i++)
+        aChars[i] = FPDFText_GetUnicode(pPdfTextPage, i);
+    OUString aActualText(aChars.data(), aChars.size());
+    CPPUNIT_ASSERT_EQUAL(aExpectedText, aActualText);
+}
+#endif
 #endif
 
 CPPUNIT_TEST_SUITE_REGISTRATION(PdfExportTest);
diff --git a/vcl/source/gdi/CommonSalLayout.cxx b/vcl/source/gdi/CommonSalLayout.cxx
index 2f110b138a8a..cfd86ed27409 100644
--- a/vcl/source/gdi/CommonSalLayout.cxx
+++ b/vcl/source/gdi/CommonSalLayout.cxx
@@ -690,6 +690,49 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs)
             for (int i = 0; i < nRunGlyphCount; ++i) {
                 int32_t nGlyphIndex = pHbGlyphInfos[i].codepoint;
                 int32_t nCharPos = pHbGlyphInfos[i].cluster;
+                int32_t nCharCount = 0;
+
+                // Find the number of characters that make up this glyph.
+                if (!bRightToLeft)
+                {
+                    // If the cluster is the same as previous glyph, then this
+                    // already consumed, skip.
+                    if (i > 0 && pHbGlyphInfos[i].cluster == pHbGlyphInfos[i - 1].cluster)
+                        nCharCount = 0;
+                    else
+                    {
+                        // Find the next glyph with a different cluster, or the
+                        // end of text.
+                        int j = i;
+                        int32_t nNextCharPos = nCharPos;
+                        while (nNextCharPos == nCharPos && j < nRunGlyphCount)
+                            nNextCharPos = pHbGlyphInfos[j++].cluster;
+
+                        if (nNextCharPos == nCharPos)
+                            nNextCharPos = rArgs.mnEndCharPos;
+                        nCharCount = nNextCharPos - nCharPos;
+                    }
+                }
+                else
+                {
+                    // If the cluster is the same as previous glyph, then this
+                    // will be consumed later, skip.
+                    if (i < nRunGlyphCount - 1 && pHbGlyphInfos[i].cluster == pHbGlyphInfos[i + 1].cluster)
+                        nCharCount = 0;
+                    else
+                    {
+                        // Find the previous glyph with a different cluster, or
+                        // the end of text.
+                        int j = i;
+                        int32_t nNextCharPos = nCharPos;
+                        while (nNextCharPos == nCharPos && j >= 0)
+                            nNextCharPos = pHbGlyphInfos[j--].cluster;
+
+                        if (nNextCharPos == nCharPos)
+                            nNextCharPos = rArgs.mnEndCharPos;
+                        nCharCount = nNextCharPos - nCharPos;
+                    }
+                }
 
                 // if needed request glyph fallback by updating LayoutArgs
                 if (!nGlyphIndex)
@@ -756,7 +799,7 @@ bool CommonSalLayout::LayoutText(ImplLayoutArgs& rArgs)
                 nYOffset = std::lround(nYOffset * nYScale);
 
                 Point aNewPos(aCurrPos.X() + nXOffset, aCurrPos.Y() + nYOffset);
-                const GlyphItem aGI(nCharPos, nGlyphIndex, aNewPos, nGlyphFlags,
+                const GlyphItem aGI(nCharPos, nCharCount, nGlyphIndex, aNewPos, nGlyphFlags,
                                     nAdvance, nXOffset);
                 AppendGlyph(aGI);
 
diff --git a/vcl/source/gdi/pdfwriter_impl.cxx b/vcl/source/gdi/pdfwriter_impl.cxx
index 03b1a1d9e12d..58711a9d862b 100644
--- a/vcl/source/gdi/pdfwriter_impl.cxx
+++ b/vcl/source/gdi/pdfwriter_impl.cxx
@@ -6598,7 +6598,6 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool
     bool bVertical = m_aCurrentPDFState.m_aFont.IsVertical();
     int nGlyphs;
     int nIndex = 0;
-    int nMaxCharPos = rText.getLength()-1;
     double fXScale = 1.0;
     double fSkew = 0.0;
     sal_Int32 nPixelFontHeight = m_pReferenceDevice->mpFontInstance->maFontSelData.mnHeight;
@@ -6717,48 +6716,25 @@ void PDFWriterImpl::drawLayout( SalLayout& rLayout, const OUString& rText, bool
     FontMetric aRefDevFontMetric = m_pReferenceDevice->GetFontMetric();
 
     // collect the glyphs into a single array
-    const int nTmpMaxGlyphs = rLayout.GetOrientation() ? 1 : nMaxGlyphs; // #i97991# temporary workaround for #i87686#
     std::vector< PDFGlyph > aGlyphs;
-    aGlyphs.reserve( nTmpMaxGlyphs );
+    aGlyphs.reserve( nMaxGlyphs );
     // first get all the glyphs and register them; coordinates still in Pixel
     Point aGNGlyphPos;
-    while ((nGlyphs = rLayout.GetNextGlyphs(nTmpMaxGlyphs, pGlyphs, aGNGlyphPos, nIndex, pFallbackFonts)) != 0)
+    while ((nGlyphs = rLayout.GetNextGlyphs(nMaxGlyphs, pGlyphs, aGNGlyphPos, nIndex, pFallbackFonts)) != 0)
     {
         aCodeUnits.clear();
+        aCodeUnitsPerGlyph.clear();
         for( int i = 0; i < nGlyphs; i++ )
         {
-            // default case: 1 glyph is one unicode
-            aCodeUnitsPerGlyph.push_back(1);
-            if (pGlyphs[i]->mnCharPos >= 0 && pGlyphs[i]->mnCharPos <= nMaxCharPos)
-            {
-                int nChars = 1;
-                // try to handle ligatures and such
-                if( i < nGlyphs-1 )
-                {
-                    nChars = pGlyphs[i+1]->mnCharPos - pGlyphs[i]->mnCharPos;
-                    int start = pGlyphs[i]->mnCharPos;
-                    // #i115618# fix for simple RTL+CTL cases
-                    // supports RTL ligatures. TODO: more complex CTL, etc.
-                    if( nChars < 0 )
-                    {
-                        nChars = -nChars;
-                        start = pGlyphs[i+1]->mnCharPos + 1;
-                    }
-                    else if (nChars == 0)
-                        nChars = 1;
-                    aCodeUnitsPerGlyph.back() = nChars;
-                    for( int n = 0; n < nChars; n++ )
-                        aCodeUnits.push_back( rText[ start + n ] );
-                }
-                else
-                    aCodeUnits.push_back(rText[pGlyphs[i]->mnCharPos]);
-            }
-            else
-                aCodeUnits.push_back( 0 );
-            // note: in case of ctl one character may result
-            // in multiple glyphs. The current SalLayout
-            // implementations set -1 then to indicate that no direct
-            // mapping is possible
+            // try to handle ligatures and such
+            int nStart = pGlyphs[i]->mnCharPos;
+            int nChars = pGlyphs[i]->mnCharCount;
+            if (nChars < 0)
+                nChars = 0;
+
+            aCodeUnitsPerGlyph.push_back(nChars);
+            for( int n = 0; n < nChars; n++ )
+                aCodeUnits.push_back( rText[ nStart + n ] );
         }
 
         registerGlyphs( nGlyphs, pGlyphs, pGlyphWidths, aCodeUnits.data(), aCodeUnitsPerGlyph.data(), pMappedGlyphs, pMappedFontObjects, pFallbackFonts );