summaryrefslogtreecommitdiff
path: root/sw
diff options
context:
space:
mode:
authorMark Hung <marklh9@gmail.com>2015-12-27 00:46:49 +0800
committerMark Hung <marklh9@gmail.com>2016-02-13 08:05:09 +0000
commit4647e778993250b8c9431e2890750916fb986ecc (patch)
tree99d285ec6a33aeca2d9df32d30d2aea801066a37 /sw
parent3596613153289dae204b5abdc7446b303021f597 (diff)
tdf#81129 Support reading non-BMP characters in HTML documents.
1. Allow a character entity (&#nnnn;) to exceed 0xffff in HTMLParser::ScanText(). 2. Return a character as sal_uInt32 (UTF-32) instead of sal_Unicode (UTF-16) from SvParser::GetNextChar(). Conflicts: sw/qa/extras/htmlexport/htmlexport.cxx Change-Id: Ida455040970fae800f0f11471b27f53461fb78e4 Reviewed-on: https://gerrit.libreoffice.org/21152 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Mark Hung <marklh9@gmail.com>
Diffstat (limited to 'sw')
-rw-r--r--sw/qa/extras/htmlexport/data/extb.html10
-rw-r--r--sw/qa/extras/htmlexport/htmlexport.cxx13
2 files changed, 23 insertions, 0 deletions
diff --git a/sw/qa/extras/htmlexport/data/extb.html b/sw/qa/extras/htmlexport/data/extb.html
new file mode 100644
index 000000000000..be73feadf89d
--- /dev/null
+++ b/sw/qa/extras/htmlexport/data/extb.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8"/>
+</head>
+<body>
+<p>𤭢</p>
+<p>&#x24b62;</p>
+</body>
+</html>
diff --git a/sw/qa/extras/htmlexport/htmlexport.cxx b/sw/qa/extras/htmlexport/htmlexport.cxx
index f951a0a57006..69b6b7db6c54 100644
--- a/sw/qa/extras/htmlexport/htmlexport.cxx
+++ b/sw/qa/extras/htmlexport/htmlexport.cxx
@@ -272,6 +272,19 @@ DECLARE_HTMLEXPORT_TEST(testTdf83890, "tdf83890.odt")
assertXPath(pDoc, "/html/body/ol[2]/ol", "start", "2");
}
+DECLARE_HTMLEXPORT_TEST(testExtbChars, "extb.html") // tdf#81129: non-BMP characters in an HTML document; presumably the macro loads data/extb.html before running the body - confirm against the macro definition
+{
+ sal_uInt32 nCh = 0x24b62; // code point U+24B62, above 0xFFFF and therefore outside the BMP
+ OUString aExpected( &nCh, 1); // expected text: a string built from the single code point in nCh
+ // Paragraph 1 of extb.html contains the character as literal text (the file declares charset UTF-8); the run fetched with index 1 must equal aExpected
+ uno::Reference<text::XTextRange> xTextRange1 = getRun(getParagraph(1), 1);
+ CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange1->getString());
+
+ // Paragraph 2 of extb.html contains the same character written as the numeric entity &#x24b62;; its run must equal aExpected as well
+ uno::Reference<text::XTextRange> xTextRange2 = getRun(getParagraph(2), 1);
+ CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange2->getString());
+}
+
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */