summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- include/rtl/character.hxx 13
-rw-r--r-- include/svtools/svparser.hxx 4
-rw-r--r-- svtools/source/svhtml/parhtml.cxx 53
-rw-r--r-- svtools/source/svrtf/parrtf.cxx 6
-rw-r--r-- svtools/source/svrtf/svparser.cxx 29
-rw-r--r-- sw/qa/extras/htmlexport/data/extb.html 10
-rw-r--r-- sw/qa/extras/htmlexport/htmlexport.cxx 13
7 files changed, 90 insertions, 38 deletions
diff --git a/include/rtl/character.hxx b/include/rtl/character.hxx
index a3d09b9b0df7..49f6803821de 100644
--- a/include/rtl/character.hxx
+++ b/include/rtl/character.hxx
@@ -222,6 +222,19 @@ sal_uInt32 const surrogatesLowLast = 0xDFFF;
}
/// @endcond
+/** Check whether a value lies in the Unicode code space 0..0x10FFFF (the UTF-16 accessible range, cf. RFC 3629).
+
+ @param code Any 32-bit value; it need not be outside the BMP. Surrogate values are not rejected.
+
+ @return True if code is less than or equal to 0x10FFFF.
+
+ @since LibreOffice 5.2
+*/
+inline bool isValidCodePoint( sal_uInt32 code)
+{
+ return code <= 0x10FFFF;
+}
+
/** Check for high surrogate.
@param code A Unicode code point.
diff --git a/include/svtools/svparser.hxx b/include/svtools/svparser.hxx
index 3f60a4019613..cfbd1152a625 100644
--- a/include/svtools/svparser.hxx
+++ b/include/svtools/svparser.hxx
@@ -59,7 +59,7 @@ protected:
rtl_TextEncoding eSrcEnc; // Source encoding
sal_uLong nNextChPos;
- sal_Unicode nNextCh; // current character for the "lex"
+ sal_uInt32 nNextCh; // current character codepoint in UTF32 for the "lex"
bool bDownloadingFile : 1; // true: An external file is
@@ -128,7 +128,7 @@ public:
inline void SetLineNr( sal_uLong nlNum ); // inline bottom
inline void SetLinePos( sal_uLong nlPos ); // inline bottom
- sal_Unicode GetNextChar();
+ sal_uInt32 GetNextChar(); // Return next Unicode codepoint in UTF32.
void RereadLookahead();
inline bool IsParserWorking() const { return SVPAR_WORKING == eState; }
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index 801e4e00ae68..a8eff6d0158f 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -25,6 +25,7 @@
#include <tools/color.hxx>
#include <rtl/ustrbuf.hxx>
#include <rtl/strbuf.hxx>
+#include <rtl/character.hxx>
#include <tools/tenccvt.hxx>
#include <tools/datetime.hxx>
@@ -429,7 +430,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
OUStringBuffer sTmpBuffer( MAX_LEN );
bool bContinue = true;
bool bEqSignFound = false;
- sal_Unicode cQuote = 0U;
+ sal_uInt32 cQuote = 0U;
while( bContinue && IsParserWorking() )
{
@@ -445,7 +446,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
sal_uLong nStreamPos = rInput.Tell();
sal_uLong nLinePos = GetLinePos();
- sal_Unicode cChar = 0U;
+ sal_uInt32 cChar = 0U;
if( '#' == (nNextCh = GetNextChar()) )
{
nNextCh = GetNextChar();
@@ -460,10 +461,10 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
{
cChar = cChar * 16U +
( nNextCh <= '9'
- ? sal_Unicode( nNextCh - '0' )
+ ? sal_uInt32( nNextCh - '0' )
: ( nNextCh <= 'F'
- ? sal_Unicode( nNextCh - 'A' + 10 )
- : sal_Unicode( nNextCh - 'a' + 10 ) ) );
+ ? sal_uInt32( nNextCh - 'A' + 10 )
+ : sal_uInt32( nNextCh - 'a' + 10 ) ) );
nNextCh = GetNextChar();
}
}
@@ -471,7 +472,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
{
do
{
- cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
+ cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
nNextCh = GetNextChar();
}
while( HTML_ISDIGIT(nNextCh) );
@@ -500,6 +501,9 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
}
else
nNextCh = 0U;
+
+ if ( ! rtl::isValidCodePoint( cChar ) )
+ cChar = '?';
}
else if( HTML_ISALPHA( nNextCh ) )
{
@@ -507,7 +511,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
sal_Int32 nPos = 0L;
do
{
- sEntityBuffer.append( nNextCh );
+ sEntityBuffer.appendUtf32( nNextCh );
nPos++;
nNextCh = GetNextChar();
}
@@ -637,7 +641,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
if( IsParserWorking() )
{
if( cChar )
- sTmpBuffer.append( cChar );
+ sTmpBuffer.appendUtf32( cChar );
}
else if( SVPAR_PENDING==eState && '>'!=cBreak )
{
@@ -661,7 +665,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
case '=':
if( '>'==cBreak && !cQuote )
bEqSignFound = true;
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
break;
case '\\':
@@ -684,7 +688,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
else if( cQuote && (cQuote==nNextCh ) )
cQuote = 0U;
}
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
bEqSignFound = false;
break;
@@ -695,14 +699,15 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
}
else
{
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
}
+
break;
case '<':
bEqSignFound = false;
if( '>'==cBreak )
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
else
bContinue = false; // break, String zusammen
break;
@@ -725,7 +730,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
if( '>'==cBreak )
{
// cr/lf in tag is handled in _GetNextToken()
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
break;
}
else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
@@ -752,7 +757,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
nNextCh = ' ';
// no break;
case ' ':
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
!bReadPRE && !bReadTextArea) )
{
@@ -787,7 +792,7 @@ int HTMLParser::ScanText( const sal_Unicode cBreak )
{
do {
// All remaining characters make their way into the text.
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
if( MAX_LEN == sTmpBuffer.getLength() )
{
aToken += sTmpBuffer.makeStringAndClear();
@@ -864,7 +869,7 @@ int HTMLParser::_GetNextRawToken()
}
else if( '!' == nNextCh )
{
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
nNextCh = GetNextChar();
}
@@ -872,7 +877,7 @@ int HTMLParser::_GetNextRawToken()
while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
{
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
nNextCh = GetNextChar();
}
@@ -959,7 +964,7 @@ int HTMLParser::_GetNextRawToken()
}
break;
case '-':
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
if( bReadComment )
{
bool bTwoMinus = false;
@@ -970,7 +975,7 @@ int HTMLParser::_GetNextRawToken()
if( MAX_LEN == sTmpBuffer.getLength() )
aToken += sTmpBuffer.makeStringAndClear();
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
nNextCh = GetNextChar();
}
@@ -1015,7 +1020,7 @@ int HTMLParser::_GetNextRawToken()
// no break
default:
// all remaining characters are appended to the buffer
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
break;
}
@@ -1095,7 +1100,7 @@ int HTMLParser::_GetNextToken()
{
OUStringBuffer sTmpBuffer;
do {
- sTmpBuffer.append( nNextCh );
+ sTmpBuffer.appendUtf32( nNextCh );
if( MAX_LEN == sTmpBuffer.getLength() )
aToken += sTmpBuffer.makeStringAndClear();
nNextCh = GetNextChar();
@@ -1166,10 +1171,10 @@ int HTMLParser::_GetNextToken()
}
bDone = aToken.endsWith( "--" );
if( !bDone )
- aToken += OUString(nNextCh);
+ aToken += OUString(&nNextCh,1);
}
else
- aToken += OUString(nNextCh);
+ aToken += OUString(&nNextCh,1);
if( !bDone )
nNextCh = GetNextChar();
}
@@ -1261,7 +1266,7 @@ int HTMLParser::_GetNextToken()
bDone = '>'==nNextCh && aToken.endsWith("%");
if( !bDone )
{
- aToken += OUString(nNextCh);
+ aToken += OUString(&nNextCh,1);
nNextCh = GetNextChar();
}
}
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
index f6f75eb73162..bdc73d363970 100644
--- a/svtools/source/svrtf/parrtf.cxx
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -191,7 +191,7 @@ int SvRTFParser::_GetNextToken()
// can be also \{, \}, \'88
for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
{
- sal_Unicode cAnsi = nNextCh;
+ sal_uInt32 cAnsi = nNextCh;
while( 0xD == cAnsi )
cAnsi = GetNextChar();
while( 0xA == cAnsi )
@@ -382,7 +382,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
case '}':
case '{':
case '+': // I found in a RTF file
- aStrBuffer.append(nNextCh);
+ aStrBuffer.append(sal_Unicode(nNextCh));
break;
case '~': // nonbreaking space
aStrBuffer.append(static_cast< sal_Unicode >(0xA0));
@@ -484,7 +484,7 @@ void SvRTFParser::ScanText( const sal_Unicode cBreak )
{
do {
// all other characters end up in the text
- aStrBuffer.append(nNextCh);
+ aStrBuffer.appendUtf32(nNextCh);
if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
{
diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx
index b5c377b72ea0..b862e66766ca 100644
--- a/svtools/source/svrtf/svparser.cxx
+++ b/svtools/source/svrtf/svparser.cxx
@@ -22,6 +22,7 @@
#include <tools/debug.hxx>
#include <rtl/textcvt.h>
#include <rtl/tencinfo.h>
+#include <rtl/character.hxx>
#include <vector>
@@ -35,7 +36,7 @@ struct SvParser_Impl
long nTokenValue; // extra value (RTF)
bool bTokenHasValue; // indicates whether nTokenValue is valid
int nToken; // actual Token
- sal_Unicode nNextCh; // actual character
+ sal_uInt32 nNextCh; // actual character
int nSaveToken; // the token from Continue
rtl_TextToUnicodeConverter hConv;
@@ -148,9 +149,9 @@ void SvParser::RereadLookahead()
nNextCh = GetNextChar();
}
-sal_Unicode SvParser::GetNextChar()
+sal_uInt32 SvParser::GetNextChar()
{
- sal_Unicode c = 0U;
+ sal_uInt32 c = 0U;
// When reading multiple bytes, we don't have to care about the file
// position when we run into the pending state. The file position is
@@ -257,7 +258,7 @@ sal_Unicode SvParser::GetNextChar()
)
{
// no convserion shall take place
- c = (sal_Unicode)c1;
+ c = reinterpret_cast<sal_uChar&>( c1 );
nChars = 1;
}
else
@@ -280,6 +281,7 @@ sal_Unicode SvParser::GetNextChar()
// read enough characters.
if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
{
+ sal_Unicode sCh[2];
while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL) != 0 )
{
rInput.ReadChar( c1 );
@@ -289,7 +291,7 @@ sal_Unicode SvParser::GetNextChar()
nChars = rtl_convertTextToUnicode(
pImplData->hConv, pImplData->hContext,
- &c1, 1, &cUC, 1,
+ &c1, 1, sCh , 2,
RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
@@ -299,7 +301,11 @@ sal_Unicode SvParser::GetNextChar()
{
if( 1 == nChars && 0 == nInfo )
{
- c = cUC;
+ c = sal_uInt32( sCh[0] );
+ }
+ else if( 2 == nChars && 0 == nInfo )
+ {
+ c = rtl::combineSurrogates( sCh[0], sCh[1] );
}
else if( 0 != nChars || 0 != nInfo )
{
@@ -311,7 +317,7 @@ sal_Unicode SvParser::GetNextChar()
"there is a converted character, but an error" );
// There are still errors, but nothing we can
// do
- c = (sal_Unicode)'?';
+ c = (sal_uInt32)'?';
nChars = 1;
}
}
@@ -356,7 +362,7 @@ sal_Unicode SvParser::GetNextChar()
// There are still errors, so we use the first
// character and restart after that.
- c = (sal_Unicode)sBuffer[0];
+ c = reinterpret_cast<sal_uChar&>( sBuffer[0] );
rInput.SeekRel( -(nLen-1) );
nChars = 1;
}
@@ -378,7 +384,7 @@ sal_Unicode SvParser::GetNextChar()
"there is no converted character and no error" );
// #73398#: If the character could not be converted,
// because a conversion is not available, do no conversion at all.
- c = (sal_Unicode)c1;
+ c = reinterpret_cast<sal_uChar&>( c1 );
nChars = 1;
}
@@ -387,6 +393,10 @@ sal_Unicode SvParser::GetNextChar()
}
while( 0 == nChars && !bErr );
}
+
+ if ( ! rtl::isValidCodePoint( c ) )
+ c = (sal_uInt32) '?' ;
+
if( bErr )
{
if( ERRCODE_IO_PENDING == rInput.GetError() )
@@ -405,6 +415,7 @@ sal_Unicode SvParser::GetNextChar()
}
else
IncLinePos();
+
return c;
}
diff --git a/sw/qa/extras/htmlexport/data/extb.html b/sw/qa/extras/htmlexport/data/extb.html
new file mode 100644
index 000000000000..be73feadf89d
--- /dev/null
+++ b/sw/qa/extras/htmlexport/data/extb.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8"/>
+</head>
+<body>
+<p>𤭢</p>
+<p>&#x24b62;</p>
+</body>
+</html>
diff --git a/sw/qa/extras/htmlexport/htmlexport.cxx b/sw/qa/extras/htmlexport/htmlexport.cxx
index f951a0a57006..69b6b7db6c54 100644
--- a/sw/qa/extras/htmlexport/htmlexport.cxx
+++ b/sw/qa/extras/htmlexport/htmlexport.cxx
@@ -272,6 +272,19 @@ DECLARE_HTMLEXPORT_TEST(testTdf83890, "tdf83890.odt")
assertXPath(pDoc, "/html/body/ol[2]/ol", "start", "2");
}
+DECLARE_HTMLEXPORT_TEST(testExtbChars, "extb.html")
+{
+ sal_uInt32 nCh = 0x24b62;
+ OUString aExpected( &nCh, 1);
+ // Paragraph 1 of extb.html holds U+24B62 as a literal character in the UTF-8 encoded file
+ uno::Reference<text::XTextRange> xTextRange1 = getRun(getParagraph(1), 1);
+ CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange1->getString());
+
+ // Paragraph 2 holds the same code point as the numeric character reference &#x24b62;
+ uno::Reference<text::XTextRange> xTextRange2 = getRun(getParagraph(2), 1);
+ CPPUNIT_ASSERT_EQUAL(aExpected, xTextRange2->getString());
+}
+
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */