From 35d248cab1f0d4800f72abb5cb6afb56f40d9083 Mon Sep 17 00:00:00 2001
From: Michael Stahl
Date: Mon, 28 Oct 2019 14:31:23 +0100
Subject: svl: HTMLParser: stop inserting control character garbage into Writer

E.g. rhbz433940-1.html contains literal ^G characters that are inserted
as-is into SwTextNodes.

This now triggers assert about CH_TXT_ATR_FIELDSTART in
SwSubFont::GetTextSize_() that was added in
19a559b0ec9b806519c405651d6d2b2e14712b4a.

Change-Id: I6aa7de41a04069e15b40865fd57894dae0fc10db
Reviewed-on: https://gerrit.libreoffice.org/81606
Reviewed-by: Michael Stahl
Tested-by: Michael Stahl
---
 svtools/source/svhtml/parhtml.cxx | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'svtools')

diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index a048ab92d923..6ce236566e61 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include <svl/lngmisc.hxx>
 #include
 #include
 
@@ -454,8 +455,12 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
                             else
                                 nNextCh = 0U;
 
-                            if ( ! rtl::isUnicodeCodePoint( cChar ) )
+                            if (!rtl::isUnicodeCodePoint(cChar)
+                                || (linguistic::IsControlChar(cChar)
+                                    && cChar != '\r' && cChar != '\n' && cChar != '\t'))
+                            {
                                 cChar = '?';
+                            }
                         }
                         else if( rtl::isAsciiAlpha( nNextCh ) )
                         {
@@ -751,8 +756,11 @@ HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
             else
             {
                 do {
+                    if (!linguistic::IsControlChar(nNextCh))
+                    {
                     // All remaining characters make their way into the text.
-                    sTmpBuffer.appendUtf32( nNextCh );
+                        sTmpBuffer.appendUtf32( nNextCh );
+                    }
                     if( MAX_LEN == sTmpBuffer.getLength() )
                     {
                         aToken += sTmpBuffer;
@@ -987,8 +995,11 @@ HtmlTokenId HTMLParser::GetNextRawToken()
             }
             [[fallthrough]];
         default:
-            // all remaining characters are appended to the buffer
-            sTmpBuffer.appendUtf32( nNextCh );
+            if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
+            {
+                // all remaining characters are appended to the buffer
+                sTmpBuffer.appendUtf32( nNextCh );
+            }
             break;
         }
 
-- 
cgit v1.2.3