summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDamjan Jovanovic <damjan@apache.org>2016-04-17 16:44:43 +0000
committerDamjan Jovanovic <damjan@apache.org>2016-04-17 16:44:43 +0000
commit60e93b8b5b6bc4220d66e95cd234a37f3c8f8fd7 (patch)
tree524b42673c7e665a1652f13410f0fe1480c3f7a9
parent8d94f9e44da8f3fdf55b1d814f921cb6dbdc37a3 (diff)
Make CSV line parsers consistent with CSV field parsers.
Our CSV field parsing algorithm treats fields starting with a quote (immediately at the beginning of the row, or after the field delimiter) as quoted. A quoted field ends at the corresponding closing quote, and any remaining text between the closing quote and the next field delimiter or end of line is appended to the text already extracted from the field, but not processed further. Any quotes in this extra text are taken verbatim - they do not quote anything. Our CSV line parsers were big hacks - they essentially read and concatenate lines until an even number of quote characters is found, and then feed this through the CSV field parsers. This patch rewrites the line parsers to work exactly how the field parsers work. Text such as: "another" ",something else is now correctly parsed by both Calc and Base as: [another "],[something else] instead of breaking all further parsing. Patch by: me
Notes
-rw-r--r--connectivity/source/drivers/flat/ETable.cxx62
-rw-r--r--tools/source/stream/stream.cxx59
2 files changed, 97 insertions, 24 deletions
diff --git a/connectivity/source/drivers/flat/ETable.cxx b/connectivity/source/drivers/flat/ETable.cxx
index 1620b645cd3c..a885f1fb6513 100644
--- a/connectivity/source/drivers/flat/ETable.cxx
+++ b/connectivity/source/drivers/flat/ETable.cxx
@@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedTokenizedString& line, sal_Int32& _rnCurrent
return sal_False;
QuotedTokenizedString sLine = line; // check if the string continues on next line
- while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) != 1 )
+ xub_StrLen nLastOffset = 0;
+ bool isQuoted = false;
+ bool isFieldStarting = true;
+ while (true)
{
- m_pFileStream->ReadByteStringLine(sLine,nEncoding);
- if ( !m_pFileStream->IsEof() )
+ bool wasQuote = false;
+ const sal_Unicode *p;
+ p = sLine.GetString().GetBuffer();
+ p += nLastOffset;
+
+ while (*p)
{
- line.GetString().Append('\n');
- line.GetString() += sLine.GetString();
- sLine = line;
+ if (isQuoted)
+ {
+ if (*p == m_cStringDelimiter)
+ wasQuote = !wasQuote;
+ else
+ {
+ if (wasQuote)
+ {
+ wasQuote = false;
+ isQuoted = false;
+ if (*p == m_cFieldDelimiter)
+ isFieldStarting = true;
+ }
+ }
+ }
+ else
+ {
+ if (isFieldStarting)
+ {
+ isFieldStarting = false;
+ if (*p == m_cStringDelimiter)
+ isQuoted = true;
+ else if (*p == m_cFieldDelimiter)
+ isFieldStarting = true;
+ }
+ else if (*p == m_cFieldDelimiter)
+ isFieldStarting = true;
+ }
+ ++p;
+ }
+
+ if (wasQuote)
+ isQuoted = false;
+
+ if (isQuoted)
+ {
+ nLastOffset = sLine.Len();
+ m_pFileStream->ReadByteStringLine(sLine,nEncoding);
+ if ( !m_pFileStream->IsEof() )
+ {
+ line.GetString().Append('\n');
+ line.GetString() += sLine.GetString();
+ sLine = line;
+ }
+ else
+ break;
}
else
break;
diff --git a/tools/source/stream/stream.cxx b/tools/source/stream/stream.cxx
index 93897942fd70..a0c8428bbd76 100644
--- a/tools/source/stream/stream.cxx
+++ b/tools/source/stream/stream.cxx
@@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String& rStr, sal_Bool bEmbeddedLineBreak,
{
const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
xub_StrLen nLastOffset = 0;
- xub_StrLen nQuotes = 0;
+ bool isQuoted = false;
+ bool isFieldStarting = true;
while (!IsEof() && rStr.Len() < STRING_MAXLEN)
{
+ bool wasQuote = false;
bool bBackslashEscaped = false;
- const sal_Unicode *p, *pStart;
- p = pStart = rStr.GetBuffer();
+ const sal_Unicode *p;
+ p = rStr.GetBuffer();
p += nLastOffset;
while (*p)
{
- if (nQuotes)
+ if (isQuoted)
{
if (*p == cFieldQuote && !bBackslashEscaped)
- ++nQuotes;
- else if (bAllowBackslashEscape)
+ wasQuote = !wasQuote;
+ else
{
- if (*p == '\\')
- bBackslashEscaped = !bBackslashEscaped;
- else
- bBackslashEscaped = false;
+ if (bAllowBackslashEscape)
+ {
+ if (*p == '\\')
+ bBackslashEscaped = !bBackslashEscaped;
+ else
+ bBackslashEscaped = false;
+ }
+ if (wasQuote)
+ {
+ wasQuote = false;
+ isQuoted = false;
+ if (lcl_UnicodeStrChr( pSeps, *p ))
+ isFieldStarting = true;
+ }
}
}
- else if (*p == cFieldQuote && (p == pStart ||
- lcl_UnicodeStrChr( pSeps, p[-1])))
- nQuotes = 1;
- // A quote character inside a field content does not start
- // a quote.
+ else
+ {
+ if (isFieldStarting)
+ {
+ isFieldStarting = false;
+ if (*p == cFieldQuote)
+ isQuoted = true;
+ else if (lcl_UnicodeStrChr( pSeps, *p ))
+ isFieldStarting = true;
+ }
+ else if (lcl_UnicodeStrChr( pSeps, *p ))
+ isFieldStarting = true;
+ }
++p;
}
- if (nQuotes % 2 == 0)
- break;
- else
+ if (wasQuote)
+ isQuoted = false;
+
+ if (isQuoted)
{
nLastOffset = rStr.Len();
String aNext;
@@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String& rStr, sal_Bool bEmbeddedLineBreak,
rStr += sal_Unicode(_LF);
rStr += aNext;
}
+ else
+ break;
}
}
return nError == SVSTREAM_OK;