summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Mike Kaganski <mike.kaganski@collabora.com> 2017-08-23 09:09:57 +0300
committer Mike Kaganski <mike.kaganski@collabora.com> 2017-08-23 13:20:32 +0200
commit 5b518ab051cc04e672ceb01da42b06625a1a4ce9 (patch)
tree 69402bec04d6d9620e406cf677adbcd3a2be458a
parent d239bf6d79e93f650a4241fcd2da0cb77c9cb95b (diff)
tdf#111964: only trim XML whitespace
OUString::trim() uses rtl_uString_newTrim, which relies upon rtl_ImplIsWhitespace. The latter treats as whitespace not only characters with values less than or equal to 32, but also the Unicode General Punctuation area Space characters and some Control characters. Thus, using OUString::trim() is incorrect when the goal is to trim XML whitespace, which is defined as one of 0x09, 0x0A, 0x0D, 0x20. The comments for OUString::trim() and rtl_uString_newTrim are corrected to describe which characters are considered whitespace. A unit test is included. Change-Id: I45a132be923a52dcd5a4c35aeecb53d423b49fec Reviewed-on: https://gerrit.libreoffice.org/41444 Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com> Tested-by: Mike Kaganski <mike.kaganski@collabora.com>
-rw-r--r--include/rtl/ustring.h4
-rw-r--r--include/rtl/ustring.hxx4
-rw-r--r--sw/qa/extras/ooxmlexport/data/tdf111964.docxbin0 -> 1481 bytes
-rw-r--r--sw/qa/extras/ooxmlexport/ooxmlexport9.cxx10
-rw-r--r--writerfilter/source/ooxml/OOXMLFastContextHandler.cxx28
5 files changed, 43 insertions, 3 deletions
diff --git a/include/rtl/ustring.h b/include/rtl/ustring.h
index 831ecd66d9be..50dbd75a5ecc 100644
--- a/include/rtl/ustring.h
+++ b/include/rtl/ustring.h
@@ -2023,7 +2023,9 @@ SAL_DLLPUBLIC void SAL_CALL rtl_uString_newToAsciiUpperCase(
string.
The new string results from removing all characters with values less than
- or equal to 32 (the space character) form both ends of str.
+ or equal to 32 (the space character), and also Unicode General Punctuation
+ area Space and some Control characters, from both ends of str (see
+ rtl_ImplIsWhitespace).
This function cannot be used for language-specific conversion. The new
string does not necessarily have a reference count of 1 (in cases where
diff --git a/include/rtl/ustring.hxx b/include/rtl/ustring.hxx
index 602335e16768..c6ce9a73eb99 100644
--- a/include/rtl/ustring.hxx
+++ b/include/rtl/ustring.hxx
@@ -2947,7 +2947,9 @@ public:
of the string.
All characters that have codes less than or equal to
- 32 (the space character) are considered to be white space.
+ 32 (the space character), and Unicode General Punctuation area Space
+ and some Control characters are considered to be white space (see
+ rtl_ImplIsWhitespace).
If the string doesn't contain white spaces at both ends,
then the new string is assigned with str.
diff --git a/sw/qa/extras/ooxmlexport/data/tdf111964.docx b/sw/qa/extras/ooxmlexport/data/tdf111964.docx
new file mode 100644
index 000000000000..7cb85a1d87df
--- /dev/null
+++ b/sw/qa/extras/ooxmlexport/data/tdf111964.docx
Binary files differ
diff --git a/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx b/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx
index 0f5e9c6320e5..f9cb088d5068 100644
--- a/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx
+++ b/sw/qa/extras/ooxmlexport/ooxmlexport9.cxx
@@ -936,6 +936,16 @@ DECLARE_OOXMLEXPORT_TEST(testTdf109184, "tdf109184.docx")
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0xff0000), getProperty<sal_Int32>(xCell3, "BackColor"));
}
+DECLARE_OOXMLEXPORT_TEST(testTdf111964, "tdf111964.docx")
+{
+    // Nothing to verify when parseExport() yields no document.
+    if (xmlDocPtr pExported = parseExport("word/document.xml"))
+    {
+        // U+2002 is not one of the XML whitespace characters, so the run
+        // consisting of five of them has to come through untrimmed.
+        const sal_Unicode aExpectedRun [] { 0x2002, 0x2002, 0x2002, 0x2002, 0x2002, 0 };
+        assertXPathContent(pExported, "/w:document/w:body/w:p/w:r[4]/w:t", aExpectedRun);
+    }
+}
+
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
index 9bbbce2af961..175e86435396 100644
--- a/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
+++ b/writerfilter/source/ooxml/OOXMLFastContextHandler.cxx
@@ -588,6 +588,32 @@ void OOXMLFastContextHandler::endTxbxContent()
mpParserState->endTxbxContent();
}
+namespace {
+// Tells whether cChar is white space in the XML schema sense, i.e. one of
+// exactly four characters: tab (#x9), line feed (#xA), carriage return (#xD)
+// or space (#x20). Any other character, including other Unicode spaces,
+// yields false.
+bool IsXMLWhitespace(sal_Unicode cChar)
+{
+    switch (cChar)
+    {
+        case 0x9:
+        case 0xA:
+        case 0xD:
+        case 0x20:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Returns sText with leading and trailing XML whitespace (as decided by
+// IsXMLWhitespace) removed. The input string itself is returned when there
+// is nothing to strip, and an empty string when every character is stripped.
+OUString TrimXMLWhitespace(const OUString & sText)
+{
+    const sal_Int32 nLength = sText.getLength();
+
+    // [nBegin, nEnd) is the half-open range of characters to keep.
+    sal_Int32 nBegin = 0;
+    while (nBegin < nLength && IsXMLWhitespace(sText[nBegin]))
+        ++nBegin;
+
+    sal_Int32 nEnd = nLength;
+    while (nEnd > nBegin && IsXMLWhitespace(sText[nEnd - 1]))
+        --nEnd;
+
+    // Nothing was stripped from either end: hand back the input unchanged.
+    if (nBegin == 0 && nEnd == nLength)
+        return sText;
+
+    // The whole string consisted of XML whitespace.
+    if (nBegin == nEnd)
+        return OUString();
+
+    return sText.copy(nBegin, nEnd - nBegin);
+}
+}
+
void OOXMLFastContextHandler::text(const OUString & sText)
{
if (isForwardEvents())
@@ -599,7 +625,7 @@ void OOXMLFastContextHandler::text(const OUString & sText)
// tabs are converted to spaces
if (!IsPreserveSpace())
{
- sNormalizedText = sNormalizedText.trim().replaceAll("\t", " ");
+ sNormalizedText = TrimXMLWhitespace(sNormalizedText).replaceAll("\t", " ");
}
mpStream->utext(reinterpret_cast < const sal_uInt8 * >
(sNormalizedText.getStr()),