summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEike Rathke <erack@redhat.com>2012-04-14 18:57:31 +0200
committerEike Rathke <erack@redhat.com>2012-04-14 18:57:31 +0200
commit7928b651965f747b02593d2a9fc73fac7c86dbf5 (patch)
tree079cabd464d84456fc63a44849c402fee1ccd65b
parent95cc5de63b20c5986fe8f3913da86002eabd7cb1 (diff)
resolved fdo#48621 better handling of broken CSV files
* Non-escaped (not doubled) quotes in quoted strings are regarded as a broken representation and are taken literally; only a quote followed by a separator ends a field. Unless blanks are separators themselves, trailing blanks between the ending quote and the separator are ignored, complementary to leading blanks between a separator and a quote.
* Quotes in a non-quoted string are taken literally.
-rw-r--r--sc/source/ui/docshell/impex.cxx164
-rw-r--r--sc/source/ui/inc/impex.hxx17
2 files changed, 151 insertions, 30 deletions
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index f33a0d7f5499..cf51c0703dd7 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -573,6 +573,77 @@ void ScImportExport::WriteUnicodeOrByteEndl( SvStream& rStrm )
}
+enum QuoteType // classification of a quote character met while scanning CSV data
+{
+ FIELDSTART_QUOTE, // first quote in a field
+ FIRST_QUOTE, // first of a doubled (escaped) quote
+ SECOND_QUOTE, // second of a doubled (escaped) quote
+ FIELDEND_QUOTE, // quote ending a quoted field; ReadCsvLine() also uses it as the state for unquoted content
+ DONTKNOW_QUOTE // unescaped quote not considered as end of field, not counted for pairing
+};
+
+
+/** Determine if *p is a quote that ends a quoted field.
+
+    Precondition: we are parsing a quoted field already and *p is a quote.
+    @param pSeps  field separator characters, searched with ScGlobal::UnicodeStrChr().
+    @return
+    FIELDEND_QUOTE if a separator or end of string follows, possibly after blanks
+    DONTKNOW_QUOTE anything else
+ */
+static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps )
+{
+    // Due to broken CSV generators that don't double embedded quotes, end the
+    // field only at end of string or if a separator follows. A blank that is
+    // itself a separator ends the field at once; other blanks are skipped.
+    if (p[1] == ' ' && ScGlobal::UnicodeStrChr( pSeps, ' '))
+        return FIELDEND_QUOTE;
+    while (p[1] == ' ')
+        ++p;
+    return (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1])) ? FIELDEND_QUOTE : DONTKNOW_QUOTE;
+}
+
+
+/** Classify the quote at *p: part of a doubled (escaped) pair, the end of
+    a quoted field, or neither.
+
+    Precondition: *p is a quote.
+
+    @param nQuotes
+    Number of quote characters counted so far.
+    An odd count (opening quote seen) means no embedded quotes or only
+    complete quote pairs were encountered up to here.
+    An even count means we are either outside a quoted field or the first
+    quote of a pair was just encountered.
+
+    @return
+    FIELDSTART_QUOTE if first quote in a field, be it starting content or
+                     embedded, so the caller should check that beforehand.
+    FIRST_QUOTE      if first of a doubled quote
+    SECOND_QUOTE     if second of a doubled quote
+    FIELDEND_QUOTE   if end of field quote
+    DONTKNOW_QUOTE   if an unescaped quote not considered as end of field;
+                     the caller must not increment nQuotes then!
+ */
+static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
+        const sal_Unicode* pSeps, sal_Unicode cStr )
+{
+    const bool bOddQuoteCount = (nQuotes % 2) != 0;
+    if (bOddQuoteCount)
+    {
+        // A directly following quote makes this the first of a doubled pair,
+        // otherwise let lcl_isFieldEndQuote() decide whether the field ends.
+        return (p[1] == cStr) ? FIRST_QUOTE : lcl_isFieldEndQuote( p, pSeps);
+    }
+
+    if (p[-1] == cStr)
+        return SECOND_QUOTE;
+
+    SAL_WARN( "sc", "lcl_isEscapedOrFieldEndQuote: really want a FIELDSTART_QUOTE?");
+    return FIELDSTART_QUOTE;
+}
+
+
/** Append characters of [p1,p2) to rField.
@returns TRUE if ok; FALSE if data overflow, truncated
@@ -606,7 +677,7 @@ enum DoubledQuoteMode
};
static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
- sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
+ const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
{
p++; //! jump over opening quote
bool bCont;
@@ -621,7 +692,18 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, String& rString,
if( *p == cStr )
{
if ( *++p != cStr )
- break;
+ {
+ // break or continue for loop
+ if (eMode == DQM_ESCAPE)
+ {
+ if (lcl_isFieldEndQuote( p-1, pSeps) == FIELDEND_QUOTE)
+ break;
+ else
+ continue;
+ }
+ else
+ break;
+ }
// doubled quote char
switch ( eMode )
{
@@ -815,6 +897,10 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
{
bool bOk = true;
+ sal_Unicode pSeps[2];
+ pSeps[0] = cSep;
+ pSeps[1] = 0;
+
SCCOL nStartCol = aRange.aStart.Col();
SCROW nStartRow = aRange.aStart.Row();
SCCOL nEndCol = aRange.aEnd.Col();
@@ -843,7 +929,7 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
aCell.Erase();
if( *p == cStr )
{
- p = lcl_ScanString( p, aCell, cStr, DQM_KEEP, bOverflowCell );
+ p = lcl_ScanString( p, aCell, pSeps, cStr, DQM_KEEP, bOverflowCell );
while( *p && *p != cSep )
p++;
if( *p )
@@ -1277,7 +1363,7 @@ bool ScImportExport::ExtText2Doc( SvStream& rStrm )
for( ;; )
{
aLine = ReadCsvLine(rStrm, !bFixed, rSeps, cStr);
- if ( rStrm.IsEof() )
+ if ( rStrm.IsEof() && aLine.isEmpty() )
break;
sal_Int32 nLineLen = aLine.getLength();
@@ -1445,7 +1531,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
{
rbIsQuoted = true;
const sal_Unicode* p1;
- p1 = p = lcl_ScanString( p, rField, cStr, DQM_ESCAPE, rbOverflowCell );
+ p1 = p = lcl_ScanString( p, rField, pSeps, cStr, DQM_ESCAPE, rbOverflowCell );
while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
p++;
// Append remaining unquoted and undelimited data (dirty, dirty) to
@@ -2212,9 +2298,8 @@ inline const sal_Unicode* lcl_UnicodeStrChr( const sal_Unicode* pStr,
return 0;
}
-rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
- const String& rFieldSeparators, sal_Unicode cFieldQuote,
- bool bAllowBackslashEscape)
+rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
+ const String& rFieldSeparators, sal_Unicode cFieldQuote )
{
rtl::OUString aStr;
rStream.ReadUniOrByteStringLine(aStr, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
@@ -2226,11 +2311,13 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
// See if the separator(s) include tab.
bool bTabSep = lcl_UnicodeStrChr(pSeps, '\t') != NULL;
+ QuoteType eQuoteState = FIELDEND_QUOTE;
+ bool bFieldStart = true;
+
sal_Int32 nLastOffset = 0;
sal_Int32 nQuotes = 0;
while (!rStream.IsEof() && aStr.getLength() < nArbitraryLineLengthLimit)
{
- bool bBackslashEscaped = false;
const sal_Unicode *p, *pStart;
p = pStart = aStr.getStr();
p += nLastOffset;
@@ -2248,25 +2335,66 @@ rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
break;
}
- if (*p == cFieldQuote && !bBackslashEscaped)
- ++nQuotes;
- else if (bAllowBackslashEscape)
+ if (*p == cFieldQuote)
+ {
+ if (bFieldStart)
+ {
+ ++nQuotes;
+ bFieldStart = false;
+ eQuoteState = FIELDSTART_QUOTE;
+ }
+ // Do not detect a FIELDSTART_QUOTE if not in
+ // bFieldStart mode, in which case for unquoted content
+ // we are in FIELDEND_QUOTE state.
+ else if (eQuoteState != FIELDEND_QUOTE)
+ {
+ eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote);
+ // DONTKNOW_QUOTE is an embedded unescaped quote we
+ // don't count for pairing.
+ if (eQuoteState != DONTKNOW_QUOTE)
+ ++nQuotes;
+ }
+ }
+ else if (eQuoteState == FIELDEND_QUOTE)
+ {
+ if (bFieldStart)
+ // If blank is a separator it starts a field, if it
+ // is not and thus maybe leading before quote we
+ // are still at start of field regarding quotes.
+ bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
+ else
+ bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
+ }
+ }
+ else
+ {
+ if (*p == cFieldQuote && bFieldStart)
+ {
+ nQuotes = 1;
+ eQuoteState = FIELDSTART_QUOTE;
+ bFieldStart = false;
+ }
+ else if (eQuoteState == FIELDEND_QUOTE)
{
- if (*p == '\\')
- bBackslashEscaped = !bBackslashEscaped;
+ // This also skips leading blanks at beginning of line
+ // if followed by a quote. It's debatable whether we
+ // actually want that or not, but congruent with what
+ // ScanNextFieldFromString() does.
+ if (bFieldStart)
+ bFieldStart = (*p == ' ' || lcl_UnicodeStrChr( pSeps, *p) != NULL);
else
- bBackslashEscaped = false;
+ bFieldStart = (lcl_UnicodeStrChr( pSeps, *p) != NULL);
}
}
- else if (*p == cFieldQuote && (p == pStart ||
- lcl_UnicodeStrChr( pSeps, p[-1])))
- nQuotes = 1;
// A quote character inside a field content does not start
// a quote.
++p;
}
if (nQuotes % 2 == 0)
+ // We still have a (theoretical?) problem here if due to
+ // nArbitraryLineLengthLimit we split a string right between a
+ // doubled quote pair.
break;
else
{
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index 03aff6878f2f..ed2644d2f9c4 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -208,11 +208,10 @@ public:
within a field, the field content MUST be surrounded by
cFieldQuote characters, and the opening cFieldQuote MUST be
at the very start of a line or follow right behind a field
- separator with no extra characters in between. Anything,
+ separator with no extra characters in between, with the
+ exception of blanks contradictory to RFC 4180. Anything,
including field separators and escaped quotes (by doubling
- them, or preceding them with a backslash if
- bAllowBackslashEscape==TRUE) may appear in a quoted
- field.
+ them) may appear in a quoted field.
If bEmbeddedLineBreak==FALSE, nothing is parsed and the
string returned is simply one ReadUniOrByteStringLine().
@@ -223,11 +222,6 @@ public:
@param cFieldQuote
The quote character used.
- @param bAllowBackslashEscape
- If TRUE, an embedded quote character inside a quoted
- field may also be escaped with a preceding backslash.
- Normally, quotes are escaped by doubling them.
-
check Stream::good() to detect IO problems during read
@ATTENTION
@@ -247,9 +241,8 @@ public:
may start under false preconditions.
*/
-SC_DLLPUBLIC rtl::OUString ReadCsvLine(SvStream &rStream, bool bEmbeddedLineBreak,
- const String& rFieldSeparators, sal_Unicode cFieldQuote,
- bool bAllowBackslashEscape = false);
+SC_DLLPUBLIC rtl::OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
+ const String& rFieldSeparators, sal_Unicode cFieldQuote );
#endif