vcl PDF tokenizer: fix EOF position when \r is not followed by \n

Otherwise this would break partial tokenize when we only read a trailer in the middle of the file: m_aEOFs.back() is one byte larger than rStream.Tell(), so we read past the end of the trailer, resulting in a tokenize failure. What's special about the bugdoc: - it has 2 xrefs, the first is incomplete, and refers to a second which is later in the file - the object length is an indirect object, triggering an xref lookup - the first EOF is followed by a \r, which is then not followed by a \n This results in reading past the end of the first trailer and then triggering a lookup failure. FWIW, pdfium does the same in <https://pdfium.googlesource.com/pdfium/+/59d107323f6727bbd5f8a4d0843081790638a1dd/core/fpdfapi/parser/cpdf_syntax_parser.cpp#446>, we're now in sync with it. (cherry picked from commit 6b1d5bafdc722d07d3dc4980764275a6caa707ba) Conflicts: vcl/qa/cppunit/filter/ipdf/ipdf.cxx Change-Id: Ia556a25e333b5e4f1418d92a98d74358862120e2 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115537 Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoffice@gmail.com> Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
author: Miklos Vajna <vmiklos@collabora.com> 2021-05-12 10:51:09 +0200
committer: Andras Timar <andras.timar@collabora.com> 2021-05-27 11:09:27 +0200
commit: 5f00200b8d6cb095156b5ea963a8cad3bbd918a6 (patch)
tree: eb850e3e948d81e4bb5b9aed38cee8fd877438c6
parent: 8def73a66021cccb241bf4916ef051932bd5730c (diff)
3 files changed, 94 insertions, 1 deletions
diff --git a/vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf b/vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf
new file mode 100644
index 000000000000..6f1ad86f5c99
--- /dev/null
+++ b/vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf
@@ -0,0 +1,69 @@
+%PDF-1.7
+%���
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [0 0 200 300]
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+>>
+endobj
+4 0 obj <<
+  /Length 4
+>>
+stream
+q
+Q
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000157 00000 n 
+0000000226 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 5
+  /Prev 541
+>>
+startxref
+280
+%%EOF
+%%TEST
+4 0 obj <<
+  /Length 5 0 R
+>>
+stream
+q
+Q
+endstream
+endobj
+5 0 obj
+4
+endobj
+xref
+0 6
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000157 00000 n 
+0000000466 00000 n 
+0000000524 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 6
+>>
+startxref
+280
+%%EOF
diff --git a/vcl/qa/cppunit/filter/ipdf/ipdf.cxx b/vcl/qa/cppunit/filter/ipdf/ipdf.cxx
index 5055e36a922e..3307db5c9743 100644
--- a/vcl/qa/cppunit/filter/ipdf/ipdf.cxx
+++ b/vcl/qa/cppunit/filter/ipdf/ipdf.cxx
@@ -168,6 +168,25 @@ CPPUNIT_TEST_FIXTURE(VclFilterIpdfTest, testDictArrayDict)
     CPPUNIT_ASSERT(pKey);
 }
 
+CPPUNIT_TEST_FIXTURE(VclFilterIpdfTest, testCommentEnd)
+{
+    // Load the test document (comment-end.pdf):
+    // - it has two xrefs
+    // - the second xref has an updated page content object with an indirect length
+    // - the last startxref refers to the first xref
+    // - the first xref has a /Prev pointing to the second xref
+    // - the first xref is terminated by a \r, which is not followed by a \n
+    // This means that if reading doesn't stop at the end of the first xref, then we'll try to
+    // look up the offset of the length object, which we don't have yet.
+    OUString aSourceURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "comment-end.pdf";
+    SvFileStream aFile(aSourceURL, StreamMode::READ);
+    vcl::filter::PDFDocument aDocument;
+
+    // Without the accompanying fix in place, this assertion failed, because Tokenize() didn't
+    // stop at the end of the first xref.
+    CPPUNIT_ASSERT(aDocument.Read(aFile));
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx
index 64cf9dc4ef90..8715000f1627 100644
--- a/vcl/source/filter/ipdf/pdfdocument.cxx
+++ b/vcl/source/filter/ipdf/pdfdocument.cxx
@@ -2145,9 +2145,14 @@ bool PDFCommentElement::Read(SvStream& rStream)
                 sal_uInt64 nPos = rStream.Tell();
                 if (ch == '\r')
                 {
+                    rStream.ReadChar(ch);
+                    rStream.SeekRel(-1);
                     // If the comment ends with a \r\n, count the \n as well to match Adobe Acrobat
                     // behavior.
-                    nPos += 1;
+                    if (ch == '\n')
+                    {
+                        nPos += 1;
+                    }
                 }
                 m_rDoc.PushBackEOF(nPos);
             }
author	Miklos Vajna <vmiklos@collabora.com>	2021-05-12 10:51:09 +0200
committer	Andras Timar <andras.timar@collabora.com>	2021-05-27 11:09:27 +0200
commit	5f00200b8d6cb095156b5ea963a8cad3bbd918a6 (patch)
tree	eb850e3e948d81e4bb5b9aed38cee8fd877438c6
parent	8def73a66021cccb241bf4916ef051932bd5730c (diff)