diff options
author | Eike Rathke <erack@redhat.com> | 2021-08-18 19:05:08 +0200 |
---|---|---|
committer | Eike Rathke <erack@redhat.com> | 2021-08-19 17:27:51 +0200 |
commit | ff62e0165a0add7c7e3cb606df5b24b20c822d8a (patch) | |
tree | 1f10d9ce9c0851c4df726f6f06f641b5bea1994d /sc/source/ui/docshell | |
parent | 0d14e1251efeec9f651a28f553447c6d06b778e9 (diff) |
Resolves: tdf#102846 CSV: Detect separator, limit preview line concatenations
In CSV import preview, if a line starts with a quote character and
none of the remembered last field separators used occur in data in
conjunction with a closing quote, then reading data tried to
concatenate line by line to form a data field to be presented in
the preview, worst case the entire file.
For the preview, detect one possible not yet selected separator if
used with a quoted field (similar to commit
c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space
separator) and limit the number of source lines that the reader tries
to concatenate if no separator was encountered after a possibly
closing field quote.
Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690
Reviewed-by: Eike Rathke <erack@redhat.com>
Tested-by: Jenkins
Diffstat (limited to 'sc/source/ui/docshell')
-rw-r--r-- | sc/source/ui/docshell/impex.cxx | 50 |
1 files changed, 40 insertions, 10 deletions
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx index 3a798b37e50e..3c2772619dac 100644 --- a/sc/source/ui/docshell/impex.cxx +++ b/sc/source/ui/docshell/impex.cxx @@ -607,17 +607,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p // Due to broken CSV generators that don't double embedded quotes check if // a field separator immediately or with trailing spaces follows the quote, // only then end the field, or at end of string. - const sal_Unicode cBlank = ' '; + constexpr sal_Unicode cBlank = ' '; if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank)) return FIELDEND_QUOTE; // Detect a possible blank separator if it's not already in the list (which // was checked right above for p[1]==cBlank). - if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank) - rcDetectSep = cBlank; + const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank); while (p[1] == cBlank) ++p; if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1])) return FIELDEND_QUOTE; + // Extended separator detection after a closing quote (with or without + // blanks). Note that nQuotes is incremented *after* the call so is not yet + // even here, and that with separator detection we reach here only if + // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or + // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have + // to be checked. + if (!rcDetectSep) + { + constexpr sal_Unicode vSep[] = { ',', '\t', ';' }; + for (const sal_Unicode c : vSep) + { + if (p[1] == c) + { + rcDetectSep = c; + return FIELDEND_QUOTE; + } + } + } + // Blank separator is least significant, after others. 
+ if (bBlankSep) + { + rcDetectSep = cBlank; + return FIELDEND_QUOTE; + } return DONTKNOW_QUOTE; } @@ -645,7 +668,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p, const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep ) { - if ((nQuotes % 2) == 0) + if ((nQuotes & 1) == 0) { if (p[-1] == cStr) return SECOND_QUOTE; @@ -2481,7 +2504,7 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr ) } OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, - OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep ) + OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, sal_uInt32 nMaxSourceLines ) { enum RetryState { @@ -2506,6 +2529,8 @@ Label_RetryWithNewSep: if (bEmbeddedLineBreak) { + sal_uInt32 nLine = 0; + const sal_Unicode* pSeps = rFieldSeparators.getStr(); QuoteType eQuoteState = FIELDEND_QUOTE; @@ -2544,10 +2569,11 @@ Label_RetryWithNewSep: { eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep); - if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ') + if (eRetryState == RetryState::ALLOW && rcDetectSep) { eRetryState = RetryState::RETRY; - rFieldSeparators += " "; + rFieldSeparators += OUStringChar(rcDetectSep); + pSeps = rFieldSeparators.getStr(); goto Label_RetryWithNewSep; } @@ -2593,10 +2619,14 @@ Label_RetryWithNewSep: ++p; } - if (nQuotes % 2 == 0) + if ((nQuotes & 1) == 0) // We still have a (theoretical?) problem here if due to - // nArbitraryLineLengthLimit we split a string right between a - // doubled quote pair. + // nArbitraryLineLengthLimit (or nMaxSourceLines below) we + // split a string right between a doubled quote pair. + break; + else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0) + // Unconditionally increment nLine even if nMaxSourceLines==0 + // so it can be observed in debugger. 
break; else { |