summaryrefslogtreecommitdiff
path: root/sc/source/ui/docshell
diff options
context:
space:
mode:
author Eike Rathke <erack@redhat.com> 2021-08-18 19:05:08 +0200
committer Eike Rathke <erack@redhat.com> 2021-08-19 17:27:51 +0200
commit ff62e0165a0add7c7e3cb606df5b24b20c822d8a (patch)
tree 1f10d9ce9c0851c4df726f6f06f641b5bea1994d /sc/source/ui/docshell
parent 0d14e1251efeec9f651a28f553447c6d06b778e9 (diff)
Resolves: tdf#102846 CSV: Detect separator, limit preview line concatenations
In CSV import preview, if a line starts with a quote character and none of the remembered last used field separators occurs in the data in conjunction with a closing quote, then reading data tried to concatenate line by line to form a data field to be presented in the preview, in the worst case the entire file. For the preview, detect one possible not yet selected separator if used with a quoted field (similar to commit c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space separator) and limit the number of source lines that are tried to be concatenated if no separator was encountered after a possibly closing field quote. Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690 Reviewed-by: Eike Rathke <erack@redhat.com> Tested-by: Jenkins
Diffstat (limited to 'sc/source/ui/docshell')
-rw-r--r--sc/source/ui/docshell/impex.cxx50
1 files changed, 40 insertions, 10 deletions
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 3a798b37e50e..3c2772619dac 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -607,17 +607,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
// Due to broken CSV generators that don't double embedded quotes check if
// a field separator immediately or with trailing spaces follows the quote,
// only then end the field, or at end of string.
- const sal_Unicode cBlank = ' ';
+ constexpr sal_Unicode cBlank = ' ';
if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
return FIELDEND_QUOTE;
// Detect a possible blank separator if it's not already in the list (which
// was checked right above for p[1]==cBlank).
- if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
- rcDetectSep = cBlank;
+ const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank);
while (p[1] == cBlank)
++p;
if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
return FIELDEND_QUOTE;
+ // Extended separator detection after a closing quote (with or without
+ // blanks). Note that nQuotes is incremented *after* the call so is not yet
+ // even here, and that with separator detection we reach here only if
+ // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or
+ // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have
+ // to be checked.
+ if (!rcDetectSep)
+ {
+ constexpr sal_Unicode vSep[] = { ',', '\t', ';' };
+ for (const sal_Unicode c : vSep)
+ {
+ if (p[1] == c)
+ {
+ rcDetectSep = c;
+ return FIELDEND_QUOTE;
+ }
+ }
+ }
+ // Blank separator is least significant, after others.
+ if (bBlankSep)
+ {
+ rcDetectSep = cBlank;
+ return FIELDEND_QUOTE;
+ }
return DONTKNOW_QUOTE;
}
@@ -645,7 +668,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
{
- if ((nQuotes % 2) == 0)
+ if ((nQuotes & 1) == 0)
{
if (p[-1] == cStr)
return SECOND_QUOTE;
@@ -2481,7 +2504,7 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr )
}
OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
- OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep )
+ OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, sal_uInt32 nMaxSourceLines )
{
enum RetryState
{
@@ -2506,6 +2529,8 @@ Label_RetryWithNewSep:
if (bEmbeddedLineBreak)
{
+ sal_uInt32 nLine = 0;
+
const sal_Unicode* pSeps = rFieldSeparators.getStr();
QuoteType eQuoteState = FIELDEND_QUOTE;
@@ -2544,10 +2569,11 @@ Label_RetryWithNewSep:
{
eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep);
- if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ')
+ if (eRetryState == RetryState::ALLOW && rcDetectSep)
{
eRetryState = RetryState::RETRY;
- rFieldSeparators += " ";
+ rFieldSeparators += OUStringChar(rcDetectSep);
+ pSeps = rFieldSeparators.getStr();
goto Label_RetryWithNewSep;
}
@@ -2593,10 +2619,14 @@ Label_RetryWithNewSep:
++p;
}
- if (nQuotes % 2 == 0)
+ if ((nQuotes & 1) == 0)
// We still have a (theoretical?) problem here if due to
- // nArbitraryLineLengthLimit we split a string right between a
- // doubled quote pair.
+ // nArbitraryLineLengthLimit (or nMaxSourceLines below) we
+ // split a string right between a doubled quote pair.
+ break;
+ else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0)
+ // Unconditionally increment nLine even if nMaxSourceLines==0
+ // so it can be observed in debugger.
break;
else
{